[ { "id": "B1e-kxSKDH", "title": "Structured Object-Aware Physics Prediction for Video Modeling and Planning", "track": "main", "status": "Poster", "tldr": "We propose a structured object-aware video prediction model, which explicitly reasons about objects and demonstrate that it provides high-quality long term video predictions for planning.", "abstract": "When humans observe a physical system, they can easily locate components, understand their interactions, and anticipate future behavior, even in settings with complicated and previously unseen interactions. For computers, however, learning such models from videos in an unsupervised fashion is an unsolved research problem. In this paper, we present STOVE, a novel state-space model for videos, which explicitly reasons about objects and their positions, velocities, and interactions. It is constructed by combining an image model and a dynamics model in a compositional manner and improves on previous work by reusing the dynamics model for inference, accelerating and regularizing training. STOVE predicts videos with convincing physical behavior over hundreds of timesteps, outperforms previous unsupervised models, and even approaches the performance of supervised baselines. We further demonstrate the strength of our model as a simulator for sample efficient model-based control, in a task with heavily interacting objects.\n", "keywords": "self-supervised learning;probabilistic deep learning;structured models;video prediction;physics prediction;planning;variational autoencoders;model-based reinforcement learning;VAEs;unsupervised;variational;graph neural networks;tractable probabilistic models;attend-infer-repeat;relational learning;AIR;sum-product networks;object-oriented;object-centric;object-aware;MCTS", "primary_area": "", "supplementary_material": "", "author": "Jannik Kossen;Karl Stelzner;Marcel Hussing;Claas Voelcker;Kristian Kersting", "authorids": "kossen@stud.uni-heidelberg.de;stelzner@cs.tu-darmstadt.de;marcel.hussing@stud.tu-darmstadt.de;c.voelcker@stud.tu-darmstadt.de;kersting@cs.tu-darmstadt.de", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nKossen2020Structured,\ntitle={Structured Object-Aware Physics Prediction for Video Modeling and Planning},\nauthor={Jannik Kossen and Karl Stelzner and Marcel Hussing and Claas Voelcker and Kristian Kersting},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e-kxSKDH}\n}", "github": "https://github.com/ICLR20/STOVE", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e-kxSKDH", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "779;585;401", "wc_reply_reviewers": "65;0;0", "wc_reply_authors": "842;778;500", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 588.3333333333334, 154.33585311117946 ], "wc_reply_reviewers_avg": [ 21.666666666666668, 30.641293851417057 ], "wc_reply_authors_avg": [ 706.6666666666666, 148.4527609114166 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 73, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=9673300822333166750&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "B1e3OlStPB", "title": "DeepSphere: a graph-based spherical CNN", "track": "main", "status": "Spotlight", "tldr": "A graph-based spherical CNN that strikes an interesting balance of trade-offs for a wide variety of applications.", "abstract": "Designing a convolution for a spherical neural network requires a delicate tradeoff between efficiency and rotation equivariance. DeepSphere, a method based on a graph representation of the discretized sphere, strikes a controllable balance between these two desiderata. This contribution is twofold. First, we study both theoretically and empirically how equivariance is affected by the underlying graph with respect to the number of pixels and neighbors. Second, we evaluate DeepSphere on relevant problems. Experiments show state-of-the-art performance and demonstrates the efficiency and flexibility of this formulation. Perhaps surprisingly, comparison with previous work suggests that anisotropic filters might be an unnecessary price to pay. Our code is available at https://github.com/deepsphere.", "keywords": "spherical cnns;graph neural networks;geometric deep learning", "primary_area": "", "supplementary_material": "", "author": "Micha\u00ebl Defferrard;Martino Milani;Fr\u00e9d\u00e9rick Gusset;Nathana\u00ebl Perraudin", "authorids": "michael.defferrard@epfl.ch;martino.milani@epfl.ch;frederick.gusset@epfl.ch;nathanael.perraudin@sdsc.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nDefferrard2020DeepSphere:,\ntitle={DeepSphere: a graph-based spherical CNN},\nauthor={Micha\u00ebl Defferrard and Martino Milani and Fr\u00e9d\u00e9rick Gusset and Nathana\u00ebl Perraudin},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e3OlStPB}\n}", "github": "https://github.com/deepsphere", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e3OlStPB", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "155;184;258", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "96;107;407", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 199.0, 43.36665385600631 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 203.33333333333334, 144.0840803913542 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17982837150918641650&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1e5NySKwH", "title": "Instant Quantization of Neural Networks using Monte Carlo Methods", "track": "main", "status": "Withdraw", "tldr": "Monte Carlo methods for quantizing pre-trained models without any additional training.", "abstract": "Low bit-width integer weights and activations are very important for efficient inference, especially with respect to lower power consumption. We propose to apply Monte Carlo methods and importance sampling to sparsify and quantize pre-trained neural networks without any retraining. 
We obtain sparse, low bit-width integer representations that approximate the full precision weights and activations. The precision, sparsity, and complexity are easily configurable by the amount of sampling performed. Our approach, called Monte Carlo Quantization (MCQ), is linear in both time and space, while the resulting quantized sparse networks show minimal accuracy loss compared to the original full-precision networks. Our method either outperforms or achieves results competitive with methods that do require additional training on a variety of challenging tasks.", "keywords": "monte carlo;importance sampling;network quantization", "primary_area": "", "supplementary_material": "", "author": "Gon\u00e7alo Mordido;Matthijs Van Keirsbilck;Alexander Keller", "authorids": "goncalo.mordido@hpi.de;matthijsv@nvidia.com;akeller@nvidia.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1e5NySKwH", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "519;977;112", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 536.0, 353.3393081255844 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1043696544244402983&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "B1e5TA4FPr", "title": "Pareto Optimality in No-Harm Fairness", "track": "main", "status": "Reject", "tldr": "We propose a method to reduce risk disparity gaps between sensitive groups in classification and regression tasks following the no unnecessary harm principle, ensuring that tradeoffs are minimally costly to any subgroup", "abstract": "Common fairness definitions in machine learning focus on balancing various notions of disparity and utility. In this work we study fairness in the context of risk disparity among sub-populations. We introduce the framework of Pareto-optimal fairness, where the goal of reducing risk disparity gaps is secondary only to the principle of not doing unnecessary harm, a concept that is especially applicable to high-stakes domains such as healthcare. We provide analysis and methodology to obtain maximally-fair no-harm classifiers on finite datasets. We argue that even in domains where fairness at cost is required, no-harm fairness can prove to be the optimal first step. This same methodology can also be applied to any unbalanced classification task, where we want to dynamically equalize the misclassification risks across outcomes without degrading overall performance any more than strictly necessary. 
We test the proposed methodology on real case-studies of predicting income, ICU patient mortality, classifying skin lesions from images, and assessing credit risk, demonstrating how the proposed framework compares favorably to other traditional approaches.", "keywords": "Fairness;Fairness in Machine Learning;No-Harm Fairness", "primary_area": "", "supplementary_material": "", "author": "Natalia Martinez;Martin Bertran;Guillermo Sapiro", "authorids": "natalia.martinez@duke.edu;martin.bertran@duke.edu;guillermo.sapiro@duke.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmartinez2020pareto,\ntitle={Pareto Optimality in No-Harm Fairness},\nauthor={Natalia Martinez and Martin Bertran and Guillermo Sapiro},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e5TA4FPr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e5TA4FPr", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "458;180;622", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "331;216;384", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 420.0, 182.4353766862849 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 310.3333333333333, 70.12528470926557 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6OssJOxjAmEJ:scholar.google.com/&scioq=Pareto+Optimality+in+No-Harm+Fairness&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1e9Y2NYvS", "title": "On Robustness of Neural Ordinary Differential Equations", "track": "main", "status": "Spotlight", "tldr": "", "abstract": " Neural ordinary differential equations (ODEs) have been attracting increasing attention in various research domains recently. There have been some works studying optimization issues and approximation capabilities of neural ODEs, but their robustness is still yet unclear. In this work, we fill this important gap by exploring robustness properties of neural ODEs both empirically and theoretically. We first present an empirical study on the robustness of the neural ODE-based networks (ODENets) by exposing them to inputs with various types of perturbations and subsequently investigating the changes of the corresponding outputs. In contrast to conventional convolutional neural networks (CNNs), we find that the ODENets are more robust against both random Gaussian perturbations and adversarial attack examples. We then provide an insightful understanding of this phenomenon by exploiting a certain desirable property of the flow of a continuous-time ODE, namely that integral curves are non-intersecting. Our work suggests that, due to their intrinsic robustness, it is promising to use neural ODEs as a basic block for building robust deep network models. To further enhance the robustness of vanilla neural ODEs, we propose the time-invariant steady neural ODE (TisODE), which regularizes the flow on perturbed data via the time-invariant property and the imposition of a steady-state constraint. 
We show that the TisODE method outperforms vanilla neural ODEs and also can work in conjunction with other state-of-the-art architectural methods to build more robust deep networks.", "keywords": "Neural ODE", "primary_area": "", "supplementary_material": "", "author": "Hanshu YAN;Jiawei DU;Vincent TAN;Jiashi FENG", "authorids": "hanshu.yan@u.nus.edu;dujiawei@u.nus.edu;vtan@nus.edu.sg;elefjia@nus.edu.sg", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nYAN2020On,\ntitle={On Robustness of Neural Ordinary Differential Equations},\nauthor={Hanshu YAN and Jiawei DU and Vincent TAN and Jiashi FENG},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e9Y2NYvS}\n}", "github": "https://github.com/HanshuYAN/TisODE", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e9Y2NYvS", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "463;194;296", "wc_reply_reviewers": "154;0;0", "wc_reply_authors": "1495;266;273", "reply_reviewers": "1;0;0", "reply_authors": "5;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 317.6666666666667, 110.88232000138204 ], "wc_reply_reviewers_avg": [ 51.333333333333336, 72.59629620181887 ], "wc_reply_authors_avg": [ 678.0, 577.7133083690098 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 188, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12991236712487678100&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1eB5xSFvr", "title": "DiffTaichi: Differentiable Programming for Physical Simulation", "track": "main", "status": "Poster", "tldr": "We study the problem of learning and optimizing through physical simulations via differentiable programming, using our proposed DiffSim programming language and compiler.", "abstract": "We present DiffTaichi, a new differentiable programming language tailored for building high-performance differentiable physical simulators. Based on an imperative programming language, DiffTaichi generates gradients of simulation steps using source code transformations that preserve arithmetic intensity and parallelism. A light-weight tape is used to record the whole simulation program structure and replay the gradient kernels in a reversed order, for end-to-end backpropagation.\nWe demonstrate the performance and productivity of our language in gradient-based learning and optimization tasks on 10 different physical simulators. 
For example, a differentiable elastic object simulator written in our language is 4.2x shorter than the hand-engineered CUDA version yet runs as fast, and is 188x faster than the TensorFlow implementation.\nUsing our differentiable programs, neural network controllers are typically optimized within only tens of iterations.", "keywords": "Differentiable programming;robotics;optimal control;physical simulation;machine learning system", "primary_area": "", "supplementary_material": "", "author": "Yuanming Hu;Luke Anderson;Tzu-Mao Li;Qi Sun;Nathan Carr;Jonathan Ragan-Kelley;Fredo Durand", "authorids": "yuanmhu@gmail.com;lukea@mit.edu;tzumao@berkeley.edu;qisu@adobe.com;ncarr@adobe.com;jrk@berkeley.edu;fredo@mit.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nHu2020DiffTaichi:,\ntitle={DiffTaichi: Differentiable Programming for Physical Simulation},\nauthor={Yuanming Hu and Luke Anderson and Tzu-Mao Li and Qi Sun and Nathan Carr and Jonathan Ragan-Kelley and Fredo Durand},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eB5xSFvr}\n}", "github": "https://github.com/yuanming-hu/difftaichi", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eB5xSFvr", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "428;427;276", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "305;450;248", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 377.0, 71.41895173318261 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 334.3333333333333, 85.0346334671285 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 486, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16308007401739546779&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7 }, { "id": "B1eBoJStwr", "title": "Semi-supervised semantic segmentation needs strong, high-dimensional perturbations", "track": "main", "status": "Reject", "tldr": "Why semi-supervised semantic segmentation is a challenging problem (no cluster assumption) and how to get consistency regularisation to work", "abstract": "Consistency regularization describes a class of approaches that have yielded ground breaking results in semi-supervised classification problems. Prior work has established the cluster assumption\\,---\\,under which the data distribution consists of uniform class clusters of samples separated by low density regions\\,---\\,as key to its success. We analyze the problem of semantic segmentation and find that the data distribution does not exhibit low density regions separating classes and offer this as an explanation for why semi-supervised segmentation is a challenging problem. \nWe then identify the conditions that allow consistency regularization to work even without such low-density regions. 
\nThis allows us to generalize the recently proposed CutMix augmentation technique to a powerful masked variant, CowMix, \nleading to a successful application of consistency regularization in the semi-supervised semantic segmentation setting and\nreaching state-of-the-art results in several standard datasets.", "keywords": "computer vision;semantic segmentation;semi-supervised;consistency regularisation", "primary_area": "", "supplementary_material": "", "author": "Geoff French;Timo Aila;Samuli Laine;Michal Mackiewicz;Graham Finlayson", "authorids": "g.french@uea.ac.uk;taila@nvidia.com;slaine@nvidia.com;m.mackiewicz@uea.ac.uk;g.finlayson@uea.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nfrench2020semisupervised,\ntitle={Semi-supervised semantic segmentation needs strong, high-dimensional perturbations},\nauthor={Geoff French and Timo Aila and Samuli Laine and Michal Mackiewicz and Graham Finlayson},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eBoJStwr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eBoJStwr", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "272;238;662", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "367;302;558", "reply_reviewers": "0;0;0", "reply_authors": "1;2;2", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 390.6666666666667, 192.36308262126482 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 409.0, 108.64928286310345 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=327565071745453507&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1eCk1StPH", "title": "The Generalization-Stability Tradeoff in Neural Network Pruning", "track": "main", "status": "Reject", "tldr": "We demonstrate that pruning methods which introduce greater instability into the loss also confer improved generalization, and explore the mechanisms underlying this effect.", "abstract": "Pruning neural network parameters is often viewed as a means to compress models, but pruning has also been motivated by the desire to prevent overfitting. This motivation is particularly relevant given the perhaps surprising observation that a wide variety of pruning approaches increase test accuracy despite sometimes massive reductions in parameter counts. To better understand this phenomenon, we analyze the behavior of pruning over the course of training, finding that pruning's effect on generalization relies more on the instability it generates (defined as the drops in test accuracy immediately following pruning) than on the final size of the pruned model. We demonstrate that even the pruning of unimportant parameters can lead to such instability, and show similarities between pruning and regularizing by injecting noise, suggesting a mechanism for pruning-based generalization improvements that is compatible with the strong generalization recently observed in over-parameterized networks.", "keywords": "pruning;generalization;stability;dynamics;regularization", "primary_area": "", "supplementary_material": "", "author": "Brian R. Bartoldson;Ari S. 
Morcos;Adrian Barbu;Gordon Erlebacher", "authorids": "bbartoldson@fsu.edu;arimorcos@gmail.com;abarbu@stat.fsu.edu;gerlebacher@fsu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbartoldson2020the,\ntitle={The Generalization-Stability Tradeoff in Neural Network Pruning},\nauthor={Brian R. Bartoldson and Ari S. Morcos and Adrian Barbu and Gordon Erlebacher},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eCk1StPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eCk1StPH", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "397;199;476", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "778;1346;845", "reply_reviewers": "0;0;0", "reply_authors": "2;3;2", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 357.3333333333333, 116.51132515291759 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 989.6666666666666, 253.44602756580915 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2374906680152964127&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1eP504YDr", "title": "Independence-aware Advantage Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most of existing advantage function estimation methods in reinforcement learning suffer from the problem of high variance, which scales unfavorably with the time horizon. To address this challenge, we propose to identify the independence property between current action and future states in environments, which can be further leveraged to effectively reduce the variance of the advantage estimation. In particular, the recognized independence property can be naturally utilized to construct a novel importance sampling advantage estimator with close-to-zero variance even when the Monte-Carlo return signal yields a large variance. To further remove the risk of the high variance introduced by the new estimator, we combine it with existing Monte-Carlo estimator via a reward decomposition model learned by minimizing the estimation variance. Experiments demonstrate that our method achieves higher sample efficiency compared with existing advantage estimation methods in complex environments. 
", "keywords": "Reinforcement Learning;Advantage Estimation", "primary_area": "", "supplementary_material": "", "author": "Pushi Zhang;Li Zhao;Guoqing Liu;Jiang Bian;Minglie Huang;Tao Qin;Tie-Yan Liu", "authorids": "zpschang@gmail.com;lizo@microsoft.com;lgq1001@mail.ustc.edu.cn;jiang.bian@microsoft.com;aihuang@mails.tsinghua.edu.cn;taoqin@microsoft.com;tie-yan.liu@microsoft.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nzhang2020independenceaware,\ntitle={Independence-aware Advantage Estimation},\nauthor={Pushi Zhang and Li Zhao and Guoqing Liu and Jiang Bian and Minglie Huang and Tao Qin and Tie-Yan Liu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eP504YDr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eP504YDr", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "294;985;588", "wc_reply_reviewers": "249;19;16", "wc_reply_authors": "1210;310;483", "reply_reviewers": "1;1;1", "reply_authors": "2;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 622.3333333333334, 283.14228853273676 ], "wc_reply_reviewers_avg": [ 94.66666666666667, 109.13701887484781 ], "wc_reply_authors_avg": [ 667.6666666666666, 389.937031953736 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8968843976194939566&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1eQcCEtDB", "title": "Calibration, Entropy Rates, and Memory in Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Building accurate language models that capture meaningful long-term dependencies is a core challenge in natural language processing. Towards this end, we present a calibration-based approach to measure long-term discrepancies between a generative sequence model and the true distribution, and use these discrepancies to improve the model. Empirically, we show that state-of-the-art language models, including LSTMs and Transformers, are \\emph{miscalibrated}: the entropy rates of their generations drift dramatically upward over time. We then provide provable methods to mitigate this phenomenon. 
Furthermore, we show how this calibration-based approach can also be used to measure the amount of memory that language models use for prediction.", "keywords": "information theory;natural language processing;calibration", "primary_area": "", "supplementary_material": "", "author": "Mark Braverman;Xinyi Chen;Sham Kakade;Karthik Narasimhan;Cyril Zhang;Yi Zhang", "authorids": "mbraverm@cs.princeton.edu;xinyic@google.com;sham@cs.washington.edu;karthikn@cs.princeton.edu;cyril.zhang@cs.princeton.edu;y.zhang@cs.princeton.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nbraverman2020calibration,\ntitle={Calibration, Entropy Rates, and Memory in Language Models},\nauthor={Mark Braverman and Xinyi Chen and Sham Kakade and Karthik Narasimhan and Cyril Zhang and Yi Zhang},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eQcCEtDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=B1eQcCEtDB", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "704;755;129", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "398;335;104", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 529.3333333333334, 283.8430708840519 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 279.0, 126.38829059687451 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6805814744150690941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14 }, { "id": "B1eWOJHKvB", "title": "Kernel of CycleGAN as a principal homogeneous space", "track": "main", "status": "Poster", "tldr": "The space of approximate solutions of CycleGAN admits a lot of symmetry, and an identity loss does not fix this.", "abstract": "Unpaired image-to-image translation has attracted significant interest due to the invention of CycleGAN, a method which utilizes a combination of adversarial and cycle consistency losses to avoid the need for paired data. It is known that the CycleGAN problem might admit multiple solutions, and our goal in this paper is to analyze the space of exact solutions and to give perturbation bounds for approximate solutions. We show theoretically that the exact solution space is invariant with respect to automorphisms of the underlying probability spaces, and, furthermore, that the group of automorphisms acts freely and transitively on the space of exact solutions. We examine the case of zero pure CycleGAN loss first in its generality, and, subsequently, expand our analysis to approximate solutions for extended CycleGAN loss where identity loss term is included. In order to demonstrate that these results are applicable, we show that under mild conditions nontrivial smooth automorphisms exist. Furthermore, we provide empirical evidence that neural networks can learn these automorphisms with unexpected and unwanted results. 
We conclude that finding optimal solutions to the CycleGAN loss does not necessarily lead to the envisioned result in image-to-image translation tasks and that underlying hidden symmetries can render the result useless.", "keywords": "Generative models;CycleGAN", "primary_area": "", "supplementary_material": "", "author": "Nikita Moriakov;Jonas Adler;Jonas Teuwen", "authorids": "nikita.moriakov@radboudumc.nl;jonasadl@kth.se;jonas.teuwen@radboudumc.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nMoriakov2020Kernel,\ntitle={Kernel of CycleGAN as a principal homogeneous space},\nauthor={Nikita Moriakov and Jonas Adler and Jonas Teuwen},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eWOJHKvB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eWOJHKvB", "pdf_size": 0, "rating": "3;6;8", "confidence": "0;0;0", "wc_review": "323;318;104", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "421;250;39", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 248.33333333333334, 102.07948972355916 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 236.66666666666666, 156.23557711211475 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16616791058364409013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "B1eWbxStPH", "title": "Directional Message Passing for Molecular Graphs", "track": "main", "status": "Spotlight", "tldr": "Directional message passing incorporates spatial directional information to improve graph neural networks.", "abstract": "Graph neural networks have recently achieved great successes in predicting quantum mechanical properties of molecules. These models represent a molecule as a graph using only the distance between atoms (nodes). They do not, however, consider the spatial direction from one atom to another, despite directional information playing a central role in empirical potentials for molecules, e.g. in angular potentials. To alleviate this limitation we propose directional message passing, in which we embed the messages passed between atoms instead of the atoms themselves. Each message is associated with a direction in coordinate space. These directional message embeddings are rotationally equivariant since the associated directions rotate with the molecule. We propose a message passing scheme analogous to belief propagation, which uses the directional information by transforming messages based on the angle between them. Additionally, we use spherical Bessel functions and spherical harmonics to construct theoretically well-founded, orthogonal representations that achieve better performance than the currently prevalent Gaussian radial basis representations while using fewer than 1/4 of the parameters. We leverage these innovations to construct the directional message passing neural network (DimeNet). DimeNet outperforms previous GNNs on average by 76% on MD17 and by 31% on QM9. 
Our implementation is available online.", "keywords": "GNN;Graph neural network;message passing;graphs;equivariance;molecules", "primary_area": "", "supplementary_material": "", "author": "Johannes Gasteiger;Janek Gro\u00df;Stephan G\u00fcnnemann", "authorids": "j.gasteiger@in.tum.de;grossja@in.tum.de;guennemann@in.tum.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nGasteiger2020Directional,\ntitle={Directional Message Passing for Molecular Graphs},\nauthor={Johannes Gasteiger and Janek Gro\u00df and Stephan G\u00fcnnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eWbxStPH}\n}", "github": "https://www.daml.in.tum.de/dimenet", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eWbxStPH", "pdf_size": 0, "rating": "6;8;8", "confidence": "0;0;0", "wc_review": "283;139;464", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "256;67;204", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 295.3333333333333, 132.96699674062816 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 175.66666666666666, 79.71755696763974 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 584, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18349010234285626260&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1eWu0NtDS", "title": "Neuron ranking - an informed way to compress convolutional neural networks", "track": "main", "status": "Withdraw", "tldr": "We propose CNN neuron ranking with two different methods and show their consistency in producing the result which allows to interpret what network deems important and compress the network by keeping the most relevant nodes.", "abstract": "Convolutional neural networks (CNNs) in recent years have made a dramatic impact in science, technology and industry, yet the theoretical mechanism of CNN architecture design remains surprisingly vague. The CNN neurons, including its distinctive element, convolutional filters, are known to be learnable features, yet their individual role in producing the output is rather unclear. The thesis of this work is that not all neurons are equally important and some of them contain more useful information to perform a given task. Hence, we propose to quantify and rank neuron importance, and directly incorporate neuron importance in the objective function under two formulations: (1) a game theoretical approach based on Shapley value which computes the marginal contribution of each filter; and (2) a probabilistic approach based on what-we-call, the importance switch using variational inference. Using these two methods we confirm the general theory that some of the neurons are inherently more important than the others. Various experiments illustrate that learned ranks can be readily useable for structured network compression and interpretability of learned features. 
", "keywords": "convolutional neural network;compression;shapley value;importance switch;variational inference;interpretability", "primary_area": "", "supplementary_material": "", "author": "Kamil Adamczewski;Mijung Park", "authorids": "kamil.m.adamczewski@gmail.com;mijung.park@tuebingen.mpg.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eWu0NtDS", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "255;961;371", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 529.0, 309.1191787428704 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:r1bZzqANhn0J:scholar.google.com/&scioq=Neuron+ranking+-+an+informed+way+to+compress+convolutional+neural+networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eX_a4twH", "title": "Superseding Model Scaling by Penalizing Dead Units and Points with Separation Constraints", "track": "main", "status": "Reject", "tldr": "We propose using a set of constraints to penalize dead neurons and points in order to train very deep networks of constant width.", "abstract": "In this article, we study a proposal that enables to train extremely thin (4 or 8 neurons per layer) and relatively deep (more than 100 layers) feedforward networks without resorting to any architectural modification such as Residual or Dense connections, data normalization or model scaling. We accomplish that by alleviating two problems. One of them are neurons whose output is zero for all the dataset, which renders them useless. This problem is known to the academic community as \\emph{dead neurons}. The other is a less studied problem, dead points. Dead points refers to data points that are mapped to zero during the forward pass of the network. As such, the gradient generated by those points is not propagated back past the layer where they die, thus having no effect in the training process. In this work, we characterize both problems and propose a constraint formulation that added to the standard loss function solves them both. As an additional benefit, the proposed method allows to initialize the network weights with constant or even zero values and still allowing the network to converge to reasonable results. 
We show very promising results on a toy dataset, MNIST, and CIFAR-10.", "keywords": "Dead Point;Dead Unit;Model Scaling;Separation Constraints;Dying ReLU;Constant Width;Deep Neural Networks;Backpropagation", "primary_area": "", "supplementary_material": "", "author": "Carles Riera;Camilo Rey-Torres;Eloi Puertas;Oriol Pujol", "authorids": "blauigris@gmail.com;camilorey@gmail.com;epuertas@ub.edu;oriol_pujol@ub.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nriera2020superseding,\ntitle={Superseding Model Scaling by Penalizing Dead Units and Points with Separation Constraints},\nauthor={Carles Riera and Camilo Rey-Torres and Eloi Puertas and Oriol Pujol},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eX_a4twH}\n}", "github": "https://www.dropbox.com/s/kl96825sae12zkc/sep_cons.zip?dl=0", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eX_a4twH", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "278;821;126", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 408.3333333333333, 298.3245808771975 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:97uB-W_74NwJ:scholar.google.com/&scioq=Superseding+Model+Scaling+by+Penalizing+Dead+Units+and+Points+with+Separation+Constraints&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eXvyHKwS", "title": "THE EFFECT OF ADVERSARIAL TRAINING: A THEORETICAL CHARACTERIZATION", "track": "main", "status": "Reject", "tldr": "We prove that adversarial training with a linear classifier can rapidly converge to a robust solution. In addition, adversarial training is stable to outliers in the dataset.", "abstract": "It has been widely shown that adversarial training (Madry et al., 2018) is empirically effective in defending against adversarial attacks. However, the theoretical understanding of the difference between the solution of adversarial training and that of standard training is limited. In this paper, we characterize the solution of adversarial training for the linear classification problem for a full range of the adversarial radius \u03b5. Specifically, we show that if the data themselves are \u03b5-strongly linearly-separable, adversarial training with radius smaller than \u03b5 converges to the hard margin solution of SVM with a faster rate than standard training. If the data themselves are not \u03b5-strongly linearly-separable, we show that adversarial training with radius \u03b5 is stable to outliers while standard training is not. Moreover, we prove that the classifier returned by adversarial training with a large radius \u03b5 has low confidence in each data point. 
Experiments corroborate our theoretical finding well.", "keywords": "adversarial training;robustness;separable data", "primary_area": "", "supplementary_material": "", "author": "Mingyang Yi;Huishuai Zhang;Wei Chen;Zhi-Ming Ma;Tie-Yan Liu", "authorids": "yimingyang17@mails.ucas.edu.cn;huzhang@microsoft.com;wche@microsoft.com;mazm@amt.ac.cn;tie-yan.liu@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyi2020the,\ntitle={{\\{}THE{\\}} {\\{}EFFECT{\\}} {\\{}OF{\\}} {\\{}ADVERSARIAL{\\}} {\\{}TRAINING{\\}}: A {\\{}THEORETICAL{\\}} {\\{}CHARACTERIZATION{\\}}},\nauthor={Mingyang Yi and Huishuai Zhang and Wei Chen and Zhi-Ming Ma and Tie-Yan Liu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eXvyHKwS}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eXvyHKwS", "pdf_size": 0, "rating": "1;1;1", "confidence": "0;0;0", "wc_review": "828;397;594", "wc_reply_reviewers": "0;91;0", "wc_reply_authors": "363;677;518", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 1.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 606.3333333333334, 176.17100278485736 ], "wc_reply_reviewers_avg": [ 30.333333333333332, 42.897811391983886 ], "wc_reply_authors_avg": [ 519.3333333333334, 128.19343023554507 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZceMslkvVsYJ:scholar.google.com/&scioq=THE+EFFECT+OF+ADVERSARIAL+TRAINING:+A+THEORETICAL+CHARACTERIZATION&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1eXygBFPH", "title": "Attacking Graph Convolutional Networks via Rewiring", "track": "main", "status": "Reject", "tldr": "Using rewiring operation to conduct adversarial attacks on graph structured data.", "abstract": "Graph Neural Networks (GNNs) have boosted the performance of many graph related tasks such as node classification and graph classification. Recent researches show that graph neural networks are vulnerable to adversarial attacks, which deliberately add carefully created unnoticeable perturbation to the graph structure. The perturbation is usually created by adding/deleting a few edges, which might be noticeable even when the number of edges modified is small. In this paper, we propose a graph rewiring operation which affects the graph in a less noticeable way compared to adding/deleting edges. We then use reinforcement learning to learn the attack strategy based on the proposed rewiring operation. Experiments on real world graphs demonstrate the effectiveness of the proposed framework. 
To understand the proposed framework, we further analyze how its generated perturbation to the graph structure affects the output of the target model.", "keywords": "Graph Neural Networks;Rewiring;Adversarial Attacks", "primary_area": "", "supplementary_material": "", "author": "Yao Ma;Suhang Wang;Tyler Derr;Lingfei Wu;Jiliang Tang", "authorids": "mayao4@msu.edu;szw494@psu.edu;derrtyle@msu.edu;wuli@us.ibm.com;tangjili@msu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nma2020attacking,\ntitle={Attacking Graph Convolutional Networks via Rewiring},\nauthor={Yao Ma and Suhang Wang and Tyler Derr and Lingfei Wu and Jiliang Tang},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eXygBFPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eXygBFPH", "pdf_size": 0, "rating": "3;3;6;6", "confidence": "0;0;0;0", "wc_review": "477;304;224;290", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1111;686;641;697", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "rating_avg": [ 4.5, 1.5 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 323.75, 93.49431800917101 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 783.75, 190.09915176033795 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=943873232204954325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B1eYGkBKDB", "title": "Fully Quantized Transformer for Improved Translation", "track": "main", "status": "Withdraw", "tldr": "We fully quantize the Transformer to 8-bit and improve translation quality compared to the full precision model.", "abstract": "State-of-the-art neural machine translation methods employ massive amounts of parameters. Drastically reducing computational costs of such methods without affecting performance has been up to this point unsolved. In this work, we propose a quantization strategy tailored to the Transformer architecture. We evaluate our method on the WMT14 EN-FR and WMT14 EN-DE translation tasks and achieve state-of-the-art quantization results for the Transformer, obtaining no loss in BLEU scores compared to the non-quantized baseline. 
We further compress the Transformer by showing that, once the model is trained, a good portion of the nodes in the encoder can be removed without causing any loss in BLEU.", "keywords": "Transformer;quantization;machine translation;compression;pruning", "primary_area": "", "supplementary_material": "", "author": "Gabriele Prato;Ella Charlaix;Mehdi Rezagholizadeh", "authorids": "prato.gab@gmail.com;ella.charlaix@huawei.com;mehdi.rezagholizadeh@huawei.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nprato2020fully,\ntitle={Fully Quantized Transformer for Improved Translation},\nauthor={Gabriele Prato and Ella Charlaix and Mehdi Rezagholizadeh},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eYGkBKDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eYGkBKDB", "pdf_size": 0, "rating": "1;3;3", "confidence": "0;0;0", "wc_review": "287;252;406", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "323;475;249", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 315.0, 65.91408549518583 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 349.0, 94.07798183776407 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11666785427644973911&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "B1eY_pVYvB", "title": "Efficient and Information-Preserving Future Frame Prediction and Beyond", "track": "main", "status": "Poster", "tldr": "", "abstract": "Applying resolution-preserving blocks is a common practice to maximize information preservation in video prediction, yet their high memory consumption greatly limits their application scenarios. We propose CrevNet, a Conditionally Reversible Network that uses reversible architectures to build a bijective two-way autoencoder and its complementary recurrent predictor. Our model enjoys the theoretically guaranteed property of no information loss during the feature extraction, much lower memory consumption and computational efficiency. The lightweight nature of our model enables us to incorporate 3D convolutions without concern of memory bottleneck, enhancing the model's ability to capture both short-term and long-term temporal dependencies. Our proposed approach achieves state-of-the-art results on Moving MNIST, Traffic4cast and KITTI datasets. We further demonstrate the transferability of our self-supervised learning method by exploiting its learnt features for object detection on KITTI. 
Our competitive results indicate the potential of using CrevNet as a generative pre-training strategy to guide downstream tasks.", "keywords": "self-supervised learning;generative pre-training;video prediction;reversible architecture", "primary_area": "", "supplementary_material": "", "author": "Wei Yu;Yichao Lu;Steve Easterbrook;Sanja Fidler", "authorids": "gnosis@cs.toronto.edu;yichao@cs.toronto.edu;sme@cs.toronto.edu;fidler@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nYu2020Efficient,\ntitle={Efficient and Information-Preserving Future Frame Prediction and Beyond},\nauthor={Wei Yu and Yichao Lu and Steve Easterbrook and Sanja Fidler},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eY_pVYvB}\n}", "github": "https://drive.google.com/file/d/1koVpH2RhkOl4_Xm_q8Iy1FuX3zQxC9gd/view?usp=sharing", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eY_pVYvB", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "149;309;243", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "452;767;240", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 233.66666666666666, 65.65228268858762 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 486.3333333333333, 216.5122526684242 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7190900656259459167&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1eYlgBYPH", "title": "A Deep Recurrent Neural Network via Unfolding Reweighted l1-l1 Minimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep unfolding methods design deep neural networks as learned variations of optimization methods. These networks have been shown to achieve faster convergence and higher accuracy than the original optimization methods. In this line of research, this paper develops a novel deep recurrent neural network (coined reweighted-RNN) by unfolding a reweighted l1-l1 minimization algorithm and applies it to the task of sequential signal reconstruction. To the best of our knowledge, this is the first deep unfolding method that explores reweighted minimization. Due to the underlying reweighted minimization model, our RNN has a different soft-thresholding function (alias, different activation function) for each hidden unit in each layer. Furthermore, it has higher network expressivity than existing deep unfolding RNN models due to the over-parameterizing weights. Moreover, we establish theoretical generalization error bounds for the proposed reweighted-RNN model by means of Rademacher complexity. The bounds reveal that the parameterization of the proposed reweighted-RNN ensures good generalization. We apply the proposed reweighted-RNN to the problem of video-frame reconstruction from low-dimensional measurements, that is, sequential frame reconstruction. 
The experimental results on the moving MNIST dataset demonstrate that the proposed deep reweighted-RNN significantly outperforms existing RNN models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huynh Van Luong;Duy Hung Le;Nikos Deligiannis", "authorids": "hvanluon@etrovub.be;dle@etrovub.be;ndeligia@etrovub.be", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nluong2020a,\ntitle={A Deep Recurrent Neural Network via Unfolding Reweighted l1-l1 Minimization},\nauthor={Huynh Van Luong and Duy Hung Le and Nikos Deligiannis},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eYlgBYPH}\n}", "github": "https://1drv.ms/u/s!ApHn770BvhH2aWay9xEhAiXydfo?e=aCX1X0", "project": "", "reviewers": "AnonReviewer1;AnonReviewer5;AnonReviewer4", "site": "https://openreview.net/forum?id=B1eYlgBYPH", "pdf_size": 0, "rating": "3;6;8", "confidence": "0;0;0", "wc_review": "255;340;216", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "495;641;543", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 270.3333333333333, 51.77086267604802 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 559.6666666666666, 60.75817274701039 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eJCcDW8dHjIJ:scholar.google.com/&scioq=A+Deep+Recurrent+Neural+Network+via+Unfolding+Reweighted+l1-l1+Minimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eZYkHYPS", "title": "Shifted Randomized Singular Value Decomposition", "track": "main", "status": "Reject", "tldr": "A randomized algorithm to estimate the SVD of a shifted data matrix without explicitly constructing the matrix in the memory.", "abstract": "We extend the randomized singular value decomposition (SVD) algorithm (Halko et al., 2011) to estimate the SVD of a shifted data matrix without explicitly constructing the matrix in the memory. With no loss in the accuracy of the original algorithm, the extended algorithm provides for a more efficient way of matrix factorization. The algorithm facilitates the low-rank approximation and principal component analysis (PCA) of off-center data matrices. 
When applied to different types of data matrices, our experimental results confirm the advantages of the extensions made to the original algorithm.", "keywords": "SVD;PCA;Randomized Algorithms", "primary_area": "", "supplementary_material": "", "author": "Ali Basirat", "authorids": "ali.basirat@lingfil.uu.se", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nbasirat2020shifted,\ntitle={Shifted Randomized Singular Value Decomposition},\nauthor={Ali Basirat},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eZYkHYPS}\n}", "github": "https://drive.google.com/file/d/1bjG5kAQ9WoTbQKFX41SnHW9eaik_SujD/view?usp=sharing", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eZYkHYPS", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "104;131;332", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 189.0, 101.7152889196113 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Bcpy3bwlTRIJ:scholar.google.com/&scioq=Shifted+Randomized+Singular+Value+Decomposition&hl=en&as_sdt=0,5", "gs_version_total": 6 }, { "id": "B1eZweHFwr", "title": "Statistical Verification of General Perturbations by Gaussian Smoothing", "track": "main", "status": "Reject", "tldr": "We present a statistical certification method to certify robustness for rotations, translations and other transformations.", "abstract": "We present a novel statistical certification method that generalizes prior work based on smoothing to handle richer perturbations. Concretely, our method produces a provable classifier which can establish statistical robustness against geometric perturbations (e.g., rotations, translations) as well as volume changes and pitch shifts on audio data. The generalization is non-trivial and requires careful handling of operations such as interpolation. 
Our method is agnostic to the choice of classifier and scales to modern architectures such as ResNet-50 on ImageNet.", "keywords": "adversarial robustness;certified network;randomised smoothing;geometric perturbations", "primary_area": "", "supplementary_material": "", "author": "Marc Fischer;Maximilian Baader;Martin Vechev", "authorids": "marcfisc@student.ethz.ch;mbaader@inf.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfischer2020statistical,\ntitle={Statistical Verification of General Perturbations by Gaussian Smoothing},\nauthor={Marc Fischer and Maximilian Baader and Martin Vechev},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eZweHFwr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eZweHFwr", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "206;267;350", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "122;209;167", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 274.3333333333333, 59.01600536201081 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 166.0, 35.52463933666322 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5256003582494489364&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1ecVlrtDr", "title": "Symmetric-APL Activations: Training Insights and Robustness to Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "Symmetric Adaptive Piecewise Linear activations are proposed as new activation function with deep explanation on training behavior and robustness to adversarial attacks.", "abstract": "Deep neural networks with learnable activation functions have shown superior performance over deep neural networks with fixed activation functions for many different problems. The adaptability of learnable activation functions adds expressive power to the model which results in better performance. Here, we propose a new learnable activation function based on Adaptive Piecewise Linear units (APL), which 1) gives equal expressive power to both the positive and negative halves on the input space and 2) is able to approximate any zero-centered continuous non-linearity in a closed interval. We investigate how the shape of the Symmetric-APL function changes during training and perform ablation studies to gain insight into the reason behind these changes. We hypothesize that these activation functions go through two distinct stages: 1) adding gradient information and 2) adding expressive power. Finally, we show that the use of Symmetric-APL activations can significantly increase the robustness of deep neural networks to adversarial attacks. 
Our experiments on both black-box and open-box adversarial attacks show that commonly-used architectures, namely Lenet, Network-in-Network, and ResNet-18 can be up to 51% more resistant to adversarial fooling by only using the proposed activation functions instead of ReLUs.", "keywords": "Activation function;Adaptive;Training;Robustness;Adversarial attack", "primary_area": "", "supplementary_material": "", "author": "Mohammadamin Tavakoli;Forest Agostinelli;Pierre Baldi", "authorids": "mohamadt@uci.edu;fagostin@uci.edu;pfbaldi@ics.uci.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntavakoli2020symmetricapl,\ntitle={Symmetric-{\\{}APL{\\}} Activations: Training Insights and Robustness to Adversarial Attacks},\nauthor={Mohammadamin Tavakoli and Forest Agostinelli and Pierre Baldi},\nyear={2020},\nurl={https://openreview.net/forum?id=B1ecVlrtDr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1ecVlrtDr", "pdf_size": 0, "rating": "1;3;6", "confidence": "0;0;0", "wc_review": "473;258;606", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "416;517;441", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 445.6666666666667, 143.37906247271795 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 458.0, 42.949582846247374 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_pqnYdbRaiwJ:scholar.google.com/&scioq=Symmetric-APL+Activations:+Training+Insights+and+Robustness+to+Adversarial+Attacks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eiJyrtDB", "title": "Improved Generalization Bound of Permutation Invariant Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We theoretically prove that a permutation invariant property of deep neural networks largely improves its generalization performance.", "abstract": "We theoretically prove that a permutation invariant property of deep neural networks largely improves its generalization performance. Learning problems with data that are invariant to permutations are frequently observed in various applications, for example, point cloud data and graph neural networks. Numerous methodologies have been developed and they achieve great performances, however, understanding a mechanism of the performance is still a developing problem. In this paper, we derive a theoretical generalization bound for invariant deep neural networks with a ReLU activation to clarify their mechanism. Consequently, our bound shows that the main term of their generalization gap is improved by $\\sqrt{n!}$ where $n$ is a number of permuting coordinates of data. Moreover, we prove that an approximation power of invariant deep neural networks can achieve an optimal rate, though the networks are restricted to be invariant. 
To achieve the results, we develop several new proof techniques such as correspondence with a fundamental domain and a scale-sensitive metric entropy.", "keywords": "Deep Neural Network;Invariance;Symmetry;Group;Generalization", "primary_area": "", "supplementary_material": "", "author": "Akiyoshi Sannai;Masaaki Imaizumi", "authorids": "akiyoshi.sannai@riken.jp;imaizumi@ism.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsannai2020improved,\ntitle={Improved Generalization Bound of Permutation Invariant Deep Neural Networks},\nauthor={Akiyoshi Sannai and Masaaki Imaizumi},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eiJyrtDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eiJyrtDB", "pdf_size": 0, "rating": "1;3;6", "confidence": "0;0;0", "wc_review": "188;477;397", "wc_reply_reviewers": "96;0;0", "wc_reply_authors": "272;116;164", "reply_reviewers": "1;0;0", "reply_authors": "2;2;2", "rating_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 354.0, 121.83869117265938 ], "wc_reply_reviewers_avg": [ 32.0, 45.254833995939045 ], "wc_reply_authors_avg": [ 184.0, 65.23802572120036 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8353791521378005488&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1eibJrtwr", "title": "Abstractive Dialog Summarization with Semantic Scaffolds", "track": "main", "status": "Reject", "tldr": "We propose a novel end-to-end model (SPNet) to incorporate semantic scaffolds for improving abstractive dialog summarization.", "abstract": "The demand for abstractive dialog summary is growing in real-world applications. For example, customer service center or hospitals would like to summarize customer service interaction and doctor-patient interaction. However, few researchers explored abstractive summarization on dialogs due to the lack of suitable datasets. We propose an abstractive dialog summarization dataset based on MultiWOZ. If we directly apply previous state-of-the-art document summarization methods on dialogs, there are two significant drawbacks: the informative entities such as restaurant names are difficult to preserve, and the contents from different dialog domains are sometimes mismatched. To address these two drawbacks, we propose Scaffold Pointer Network (SPNet) to utilize the existing annotation on speaker role, semantic slot and dialog domain. SPNet incorporates these semantic scaffolds for dialog summarization. Since ROUGE cannot capture the two drawbacks mentioned, we also propose a new evaluation metric that considers critical informative entities in the text. 
On MultiWOZ, our proposed SPNet outperforms state-of-the-art abstractive summarization methods on all the automatic and human evaluation metrics.", "keywords": "Abstractive Summarization;Dialog;Multi-task Learning", "primary_area": "", "supplementary_material": "", "author": "Lin Yuan;Zhou Yu", "authorids": "yuanlinzju@gmail.com;joyu@ucdavis.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nyuan2020abstractive,\ntitle={Abstractive Dialog Summarization with Semantic Scaffolds},\nauthor={Lin Yuan and Zhou Yu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eibJrtwr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eibJrtwr", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "138;403;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 271.6666666666667, 108.19837747807907 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13110243739500243556&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1eksh4KvH", "title": "CurricularFace: Adaptive Curriculum Learning Loss for Deep Face Recognition", "track": "main", "status": "Withdraw", "tldr": "A novel Adaptive Curriculum Learning loss for deep face recognition", "abstract": "As an emerging topic in face recognition, designing margin-based loss functions can increase the feature margin between different classes for enhanced discriminability. More recently, absorbing the idea of mining-based strategies is adopted to emphasize the misclassified samples and achieve promising results. However, during the entire training process, the prior methods either do not explicitly emphasize the sample based on its importance that renders the hard samples not fully exploited or explicitly emphasize the effects of semi-hard/hard samples even at the early training stage that may lead to convergence issues. In this work, we propose a novel Adaptive Curriculum Learning loss (CurricularFace) that embeds the idea of curriculum learning into the loss function to achieve a novel training strategy for deep face recognition, which mainly addresses easy samples in the early training stage and hard ones in the later stage. Specifically, our CurricularFace adaptively adjusts the relative importance of easy and hard samples during different training stages. In each stage, different samples are assigned with different importance according to their corresponding difficultness. Extensive experimental results on popular benchmarks demonstrate the superiority of our CurricularFace over the state-of-the-art competitors. 
Code will be available upon publication.", "keywords": "CurricularFace;Adaptive Curriculum Learning;Face Recognition", "primary_area": "", "supplementary_material": "", "author": "Yuge Huang;Yuhan Wang;Ying Tai;Xiaoming Liu;Pengcheng Shen;Shaoxin Li;Jilin Li;Feiyue Huang", "authorids": "huangyg@zju.edu.cn;wang_yuhan@zju.edu.cn;yingtai@tencent.com;liuxm@cse.msu.edu;quantshen@tencent.com;darwinli@tencent.com;jerolinli@tencent.com;garyhuang@tencent.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eksh4KvH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "217;279;316", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "364;366;202", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 270.6666666666667, 40.84387618997764 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 310.6666666666667, 76.84327484490032 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0, "gs_citation": 686, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17543857641780685133&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1elCp4KwH", "title": "Learning Hierarchical Discrete Linguistic Units from Visually-Grounded Speech", "track": "main", "status": "Talk", "tldr": "Vector quantization layers incorporated into a self-supervised neural model of speech audio learn hierarchical and discrete linguistic units (phone-like, word-like) when trained with a visual-grounding objective. ", "abstract": "In this paper, we present a method for learning discrete linguistic units by incorporating vector quantization layers into neural models of visually grounded speech. We show that our method is capable of capturing both word-level and sub-word units, depending on how it is configured. What differentiates this paper from prior work on speech unit learning is the choice of training objective. Rather than using a reconstruction-based loss, we use a discriminative, multimodal grounding objective which forces the learned units to be useful for semantic image retrieval. We evaluate the sub-word units on the ZeroSpeech 2019 challenge, achieving a 27.3% reduction in ABX error rate over the top-performing submission, while keeping the bitrate approximately the same. We also present experiments demonstrating the noise robustness of these units. Finally, we show that a model with multiple quantizers can simultaneously learn phone-like detectors at a lower layer and word-like detectors at a higher layer. 
We show that these detectors are highly accurate, discovering 279 words with an F1 score of greater than 0.5.", "keywords": "visually-grounded speech;self-supervised learning;discrete representation learning;vision and language;vision and speech;hierarchical representation learning", "primary_area": "", "supplementary_material": "", "author": "David Harwath*;Wei-Ning Hsu*;James Glass", "authorids": "dharwath@csail.mit.edu;wnhsu@mit.edu;glass@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nHarwath*2020Learning,\ntitle={Learning Hierarchical Discrete Linguistic Units from Visually-Grounded Speech},\nauthor={David Harwath* and Wei-Ning Hsu* and James Glass},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1elCp4KwH}\n}", "github": "https://github.com/wnhsu/ResDAVEnet-VQ", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1elCp4KwH", "pdf_size": 0, "rating": "6;8;8", "confidence": "0;0;0", "wc_review": "124;585;331", "wc_reply_reviewers": "0;21;0", "wc_reply_authors": "194;1467;722", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 346.6666666666667, 188.5282177523802 ], "wc_reply_reviewers_avg": [ 7.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 794.3333333333334, 522.2108982223774 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11078660580062138123&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "B1elqkrKPH", "title": "Learning robust visual representations using data augmentation invariance", "track": "main", "status": "Reject", "tldr": "We propose data augmentation invariance: a simple, yet effective and efficient way of learning robust features by adding a layer-wise invariance objective in the loss function.", "abstract": "Deep convolutional neural networks trained for image object categorization have shown remarkable similarities with representations found across the primate ventral visual stream. Yet, artificial and biological networks still exhibit important differences. Here we investigate one such property: increasing invariance to identity-preserving image transformations found along the ventral stream. Despite theoretical evidence that invariance should emerge naturally from the optimization process, we present empirical evidence that the activations of convolutional neural networks trained for object categorization are not robust to identity-preserving image transformations commonly used in data augmentation. As a solution, we propose data augmentation invariance, an unsupervised learning objective which improves the robustness of the learned representations by promoting the similarity between the activations of augmented image samples. 
Our results show that this approach is a simple, yet effective and efficient (10 % increase in training time) way of increasing the invariance of the models while obtaining similar categorization performance.", "keywords": "deep neural networks;visual cortex;invariance;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Alex Hernandez-Garcia;Peter K\u00f6nig;Tim C. Kietzmann", "authorids": "alexhg15@gmail.com;pkoenig@uos.de;t.kietzmann@donders.ru.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhernandez-garcia2020learning,\ntitle={Learning robust visual representations using data augmentation invariance},\nauthor={Alex Hernandez-Garcia and Peter K{\\\"o}nig and Tim C. Kietzmann},\nyear={2020},\nurl={https://openreview.net/forum?id=B1elqkrKPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=B1elqkrKPH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "254;171;684", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "547;751;614", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 369.6666666666667, 224.83524832394252 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 637.3333333333334, 84.90124982720938 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=46160622919673818&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "B1em8TVtPr", "title": "Discourse-Based Evaluation of Language Understanding", "track": "main", "status": "Reject", "tldr": "Semantics is not all you need", "abstract": "New models for natural language understanding have made unusual progress recently, leading to claims of universal text representations. However, current benchmarks are predominantly targeting semantic phenomena; we make the case that discourse and pragmatics need to take center stage in the evaluation of natural language understanding.\nWe introduce DiscEval, a new benchmark for the evaluation of natural language understanding, that unites 11 discourse-focused evaluation datasets. 
\nDiscEval can be used as supplementary training data in a multi-task learning setup, and is publicly available, alongside the code for gathering and preprocessing the datasets.\nUsing our evaluation suite, we show that natural language inference, a widely used pretraining task, does not result in genuinely universal representations, which opens a new challenge for multi-task learning.", "keywords": "Natural Language Understanding;Pragmatics;Discourse;Semantics;Evaluation;BERT;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Damien Sileo;Tim Van-De-Cruys;Camille Pradel;Philippe Muller", "authorids": "damien.sileo@irit.fr;tim.vandecruys@irit.fr;camille.pradel@synapse-fr.com;philippe.muller@irit.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsileo2020discoursebased,\ntitle={Discourse-Based Evaluation of Language Understanding},\nauthor={Damien Sileo and Tim Van-De-Cruys and Camille Pradel and Philippe Muller},\nyear={2020},\nurl={https://openreview.net/forum?id=B1em8TVtPr}\n}", "github": "https://github.com/disceval/DiscEval", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1em8TVtPr", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "293;661;358", "wc_reply_reviewers": "0;309;0", "wc_reply_authors": "131;399;252", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 437.3333333333333, 160.36694034480908 ], "wc_reply_reviewers_avg": [ 103.0, 145.6639969244288 ], "wc_reply_authors_avg": [ 260.6666666666667, 109.58203421283172 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2599848560701200713&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "B1em9h4KDS", "title": "Generative Imputation and Stochastic Prediction", "track": "main", "status": "Reject", "tldr": "A method to generate imputations and measure uncertainties over target class assignments based on incomplete feature vectors", "abstract": "In many machine learning applications, we are faced with incomplete datasets. In the literature, missing data imputation techniques have been mostly concerned with filling missing values. However, the existence of missing values is synonymous with uncertainties not only over the distribution of missing values but also over target class assignments that require careful consideration. In this paper, we propose a simple and effective method for imputing missing features and estimating the distribution of target assignments given incomplete data. In order to make imputations, we train a simple and effective generator network to generate imputations that a discriminator network is tasked to distinguish. Following this, a predictor network is trained using the imputed samples from the generator network to capture the classification uncertainties and make predictions accordingly. The proposed method is evaluated on CIFAR-10 image dataset as well as three real-world tabular classification datasets, under different missingness rates and structures. 
Our experimental results show the effectiveness of the proposed method in generating imputations as well as providing estimates for the class uncertainties in a classification task when faced with missing values.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohammad Kachuee;Kimmo K\u00e4rkk\u00e4inen;Orpaz Goldstein;Sajad Darabi;Majid Sarrafzadeh", "authorids": "mkachuee@ucla.edu;kimmo@cs.ucla.edu;orpgol@cs.ucla.edu;sajad.darabi@cs.ucla.edu;majid@cs.ucla.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkachuee2020generative,\ntitle={Generative Imputation and Stochastic Prediction},\nauthor={Mohammad Kachuee and Kimmo K{\\\"a}rkk{\\\"a}inen and Orpaz Goldstein and Sajad Darabi and Majid Sarrafzadeh},\nyear={2020},\nurl={https://openreview.net/forum?id=B1em9h4KDS}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1em9h4KDS", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "350;535;431", "wc_reply_reviewers": "24;0;0", "wc_reply_authors": "1066;1036;807", "reply_reviewers": "1;0;0", "reply_authors": "3;2;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 438.6666666666667, 75.72024534796196 ], "wc_reply_reviewers_avg": [ 8.0, 11.313708498984761 ], "wc_reply_authors_avg": [ 969.6666666666666, 115.67291049429949 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7755081327323779852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "B1eoyAVFwH", "title": "Feature Partitioning for Efficient Multi-Task Architectures", "track": "main", "status": "Reject", "tldr": "automatic search for multi-task architectures that reduce per-task feature use", "abstract": "Multi-task learning promises to use less data, parameters, and time than training separate single-task models. But realizing these benefits in practice is challenging. In particular, it is difficult to define a suitable architecture that has enough capacity to support many tasks while not requiring excessive compute for each individual task. There are difficult trade-offs when deciding how to allocate parameters and layers across a large set of tasks. To address this, we propose a method for automatically searching over multi-task architectures that accounts for resource constraints. We define a parameterization of feature sharing strategies for effective coverage and sampling of architectures. We also present a method for quick evaluation of such architectures with feature distillation. Together these contributions allow us to quickly optimize for parameter-efficient multi-task models. 
We benchmark on Visual Decathlon, demonstrating that we can automatically search for and identify architectures that effectively make trade-offs between task resource requirements while maintaining a high level of final performance.", "keywords": "multi-task learning;neural architecture search;multi-task architecture search", "primary_area": "", "supplementary_material": "", "author": "Alejandro Newell;Lu Jiang;Chong Wang;Li-Jia Li;Jia Deng", "authorids": "anewell@cs.princeton.edu;lujiang@google.com;chong.wang@bytedance.com;lijiali@cs.stanford.edu;jiadeng@princeton.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nnewell2020feature,\ntitle={Feature Partitioning for Efficient Multi-Task Architectures},\nauthor={Alejandro Newell and Lu Jiang and Chong Wang and Li-Jia Li and Jia Deng},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eoyAVFwH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eoyAVFwH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "213;555;491", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "213;445;389", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 419.6666666666667, 148.4527609114166 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 349.0, 98.84668262853674 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2965526537438711372&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1erJJrYPH", "title": "Optimizing Loss Landscape Connectivity via Neuron Alignment", "track": "main", "status": "Reject", "tldr": "We investigate the effect of weight symmetry on the loss landscape of deep networks. ", "abstract": "The loss landscapes of deep neural networks are poorly understood due to their high nonconvexity. Empirically, the local optima of these loss functions can be connected by a simple curve in model space, along which the loss remains fairly constant. Yet, current path finding algorithms do not consider the influence of symmetry in the loss surface caused by weight permutations of the networks corresponding to the minima. We propose a framework to investigate the effect of symmetry on the landscape connectivity by directly optimizing the weight permutations of the networks being connected. Through utilizing an existing neuron alignment technique, we derive an initialization for the weight permutations. Empirically, this initialization is critical for efficiently learning a simple, planar, low-loss curve between networks that successfully generalizes. Additionally, we introduce a proximal alternating minimization scheme to address if an optimal permutation can be learned, with some provable convergence guarantees. We find that the learned parameterized curve is still a low-loss curve after permuting the weights of the endpoint models, for a subset of permutations. 
We also show that there is a small but steady gain in the performance of the ensembles constructed from the learned curve, when considering weight space symmetry.", "keywords": "deep learning;optimization;non-convex optimization", "primary_area": "", "supplementary_material": "", "author": "N. Joseph Tatro;Pin-Yu Chen;Payel Das;Igor Melnyk;Prasanna Sattigeri;Rongjie Lai", "authorids": "tatron@rpi.edu;pin-yu.chen@ibm.com;daspa@us.ibm.com;igor.melnyk@ibm.com;psattig@us.ibm.com;lair@rpi.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ntatro2020optimizing,\ntitle={Optimizing Loss Landscape Connectivity via Neuron Alignment},\nauthor={N. Joseph Tatro and Pin-Yu Chen and Payel Das and Igor Melnyk and Prasanna Sattigeri and Rongjie Lai},\nyear={2020},\nurl={https://openreview.net/forum?id=B1erJJrYPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1erJJrYPH", "pdf_size": 0, "rating": "1;3;6", "confidence": "0;0;0", "wc_review": "476;330;616", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "201;124;149", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 474.0, 116.76757540801584 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 158.0, 32.072833779799794 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IMIU7z-mBxEJ:scholar.google.com/&scioq=Optimizing+Loss+Landscape+Connectivity+via+Neuron+Alignment&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1esx6EYvr", "title": "A critical analysis of self-supervision, or what we can learn from a single image", "track": "main", "status": "Poster", "tldr": "We evaluate self-supervised feature learning methods and find that with sufficient data augmentation early layers can be learned using just one image. This is informative about self-supervision and the role of augmentations.", "abstract": "We look critically at popular self-supervision techniques for learning deep convolutional neural networks without manual labels. We show that three different and representative methods, BiGAN, RotNet and DeepCluster, can learn the first few layers of a convolutional network from a single image as well as using millions of images and manual labels, provided that strong data augmentation is used. 
However, for deeper layers the gap with manual supervision cannot be closed even if millions of unlabelled images are used for training.\nWe conclude that:\n(1) the weights of the early layers of deep networks contain limited information about the statistics of natural images, that\n(2) such low-level statistics can be learned through self-supervision just as well as through strong supervision, and that\n(3) the low-level statistics can be captured via synthetic transformations instead of using a large image dataset.", "keywords": "self-supervision;feature representation learning;CNN", "primary_area": "", "supplementary_material": "", "author": "Asano YM.;Rupprecht C.;Vedaldi A.", "authorids": "yuki@robots.ox.ac.uk;chrisr@robots.ox.ac.uk;vedaldi@robots.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nYM.2020A,\ntitle={A critical analysis of self-supervision, or what we can learn from a single image},\nauthor={Asano YM. and Rupprecht C. and Vedaldi A.},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1esx6EYvr}\n}", "github": "[![github](/images/github_icon.svg) yukimasano/linear-probes](https://github.com/yukimasano/linear-probes) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1esx6EYvr)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1esx6EYvr", "pdf_size": 0, "rating": "1;6;6", "confidence": "0;0;0", "wc_review": "325;525;536", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "416;537;740", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 2.357022603955158 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 462.0, 96.9776606578374 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 564.3333333333334, 133.6770569527754 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 171, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1196793253523325509&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "B1esygHFwS", "title": "Detecting Change in Seasonal Pattern via Autoencoder and Temporal Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Change-point detection problem consists of discovering abrupt property changes in the generation process of time-series. Most state-of-the-art models are optimizing the power of a kernel two-sample test, with only a few assumptions on the distribution of the data. Unfortunately, because they presume the samples are distributed i.i.d, they are not able to use information about the seasonality of a time-series. In this paper, we present a novel approach - ATR-CSPD allowing the detection of changes in the seasonal pattern of a time-series. Our method uses an autoencoder together with a temporal regularization, to learn the pattern of each seasonal cycle. Using low dimensional representation of the seasonal patterns, it is possible to accurately and efficiently estimate the existence of a change point using a clustering algorithm. 
Through experiments on artificial and real-world data sets, we demonstrate the usefulness of the proposed method for several applications.", "keywords": "Autoencoder;Change Point Detection;Timeseries", "primary_area": "", "supplementary_material": "", "author": "Raphael Fettaya;Dor Bank;Rachel Lemberg;Linoy Barel", "authorids": "raphaelfettaya@gmail.com;doban@microsoft.com;rlemberg@microsoft.com;t-libare@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfettaya2020detecting,\ntitle={Detecting Change in Seasonal Pattern via Autoencoder and Temporal Regularization},\nauthor={Raphael Fettaya and Dor Bank and Rachel Lemberg and Linoy Barel},\nyear={2020},\nurl={https://openreview.net/forum?id=B1esygHFwS}\n}", "github": "https://anonymous.4open.science/r/3655aebd-63f0-4dd1-a5f8-be9dbb5ed060/", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=B1esygHFwS", "pdf_size": 0, "rating": "1;1;3;3", "confidence": "0;0;0;0", "wc_review": "1073;122;358;542", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 2.0, 1.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 523.75, 350.31583963617743 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:d_VdqwHutFYJ:scholar.google.com/&scioq=Detecting+Change+in+Seasonal+Pattern+via+Autoencoder+and+Temporal+Regularization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1evfa4tPB", "title": "Neural Network Branching for Neural Network Verification", "track": "main", "status": "Talk", "tldr": "We propose a novel learning to branch framework using graph neural networks to improve branch and bound based neural network verification methods. ", "abstract": "Formal verification of neural networks is essential for their deployment in safety-critical areas. Many available formal verification methods have been shown to be instances of a unified Branch and Bound (BaB) formulation. We propose a novel framework for designing an effective branching strategy for BaB. Specifically, we learn a graph neural network (GNN) to imitate the strong branching heuristic behaviour. Our framework differs from previous methods for learning to branch in two main aspects. Firstly, our framework directly treats the neural network we want to verify as a graph input for the GNN. Secondly, we develop an intuitive forward and backward embedding update schedule. Empirically, our framework achieves roughly $50\\%$ reduction in both the number of branches and the time required for verification on various convolutional networks when compared to the best available hand-designed branching strategy. In addition, we show that our GNN model enjoys both horizontal and vertical transferability. Horizontally, the model trained on easy properties performs well on properties of increased difficulty levels. 
Vertically, the model trained on small neural networks achieves similar performance on large neural networks.", "keywords": "Neural Network Verification;Branch and Bound;Graph Neural Network;Learning to branch", "primary_area": "", "supplementary_material": "", "author": "Jingyue Lu;M. Pawan Kumar", "authorids": "jingyue.lu@spc.ox.ac.uk;pawan@robots.ox.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nLu2020Neural,\ntitle={Neural Network Branching for Neural Network Verification },\nauthor={Jingyue Lu and M. Pawan Kumar},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1evfa4tPB}\n}", "github": "[![github](/images/github_icon.svg) oval-group/GNN_branching](https://github.com/oval-group/GNN_branching)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=B1evfa4tPB", "pdf_size": 0, "rating": "6;8;8", "confidence": "0;0;0", "wc_review": "889;454;920", "wc_reply_reviewers": "0;0;69", "wc_reply_authors": "751;602;1239", "reply_reviewers": "0;0;1", "reply_authors": "1;1;3", "rating_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 754.3333333333334, 212.74449986362097 ], "wc_reply_reviewers_avg": [ 23.0, 32.526911934581186 ], "wc_reply_authors_avg": [ 864.0, 272.05269097486735 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3408814607972511538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "B1eyA3VFwS", "title": "Enforcing Physical Constraints in Neural Neural Networks through Differentiable PDE Layer", "track": "main", "status": "Reject", "tldr": "A novel way of enforcing hard linear constraints within a convolutional neural network using a differentiable PDE layer.", "abstract": "Recent studies at the intersection of physics and deep learning have illustrated successes in the application of deep neural networks to partially or fully replace costly physics simulations. Enforcing physical constraints to solutions generated\nby neural networks remains a challenge, yet it is essential to the accuracy and trustworthiness of such model predictions. Many systems in the physical sciences are governed by Partial Differential Equations (PDEs). Enforcing these as hard\nconstraints, we show, are inefficient in conventional frameworks due to the high dimensionality of the generated fields. To this end, we propose the use of a novel differentiable spectral projection layer for neural networks that efficiently enforces\nspatial PDE constraints using spectral methods, yet is fully differentiable, allowing for its use as a layer in neural networks that supports end-to-end training. We show that its computational cost is cheaper than a regular convolution layer. We apply it to\nan important class of physical systems \u2013 incompressible turbulent flows, where the divergence-free PDE constraint is required. We train a 3D Conditional Generative Adversarial Network (CGAN) for turbulent flow super-resolution efficiently, whilst\nguaranteeing the spatial PDE constraint of zero divergence. 
Furthermore, our empirical results show that the model produces realistic flow fields with more accurate flow statistics when trained with hard constraints imposed via the proposed\nnovel differentiable spectral projection layer, as compared to soft constrained and unconstrained counterparts.", "keywords": "PDE;Hard Constraints;Turbulence;Super-Resolution;Spectral Methods", "primary_area": "", "supplementary_material": "", "author": "Chiyu \"Max\" Jiang;Karthik Kashinath;Prabhat;Philip Marcus", "authorids": "chiyu.jiang@berkeley.edu;kkashinath@lbl.gov;prabhat@lbl.gov;pmarcus@me.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njiang2020enforcing,\ntitle={Enforcing Physical Constraints in Neural Neural Networks through Differentiable {\\{}PDE{\\}} Layer},\nauthor={Chiyu ''Max'' Jiang and Karthik Kashinath and Prabhat and Philip Marcus},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eyA3VFwS}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eyA3VFwS", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "414;212;715", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 447.0, 206.670430073261 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9673223462348548782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "B1eyO1BFPr", "title": "Don't Use Large Mini-batches, Use Local SGD", "track": "main", "status": "Poster", "tldr": "", "abstract": "Mini-batch stochastic gradient methods (SGD) are state of the art for distributed training of deep neural networks. \nDrastic increases in the mini-batch sizes have lead to key efficiency and scalability gains in recent years. \nHowever, progress faces a major roadblock, as models trained with large batches often do not generalize well, i.e. they do not show good accuracy on new data.\nAs a remedy, we propose a \\emph{post-local} SGD and show that it significantly improves the generalization performance compared to large-batch training on standard benchmarks while enjoying the same efficiency (time-to-accuracy) and scalability. We further provide an extensive study of the communication efficiency vs. performance trade-offs associated with a host of \\emph{local SGD} variants. \n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tao Lin;Sebastian U. Stich;Kumar Kshitij Patel;Martin Jaggi", "authorids": "tao.lin@epfl.ch;sebastian.stich@epfl.ch;kumarkshitijpatel@gmail.com;martin.jaggi@epfl.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nLin2020Don't,\ntitle={Don't Use Large Mini-batches, Use Local SGD},\nauthor={Tao Lin and Sebastian U. 
Stich and Kumar Kshitij Patel and Martin Jaggi},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eyO1BFPr}\n}", "github": "[![github](/images/github_icon.svg) epfml/LocalSGD-Code](https://github.com/epfml/LocalSGD-Code) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1eyO1BFPr)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eyO1BFPr", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "284;352;397", "wc_reply_reviewers": "29;0;0", "wc_reply_authors": "399;310;511", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 344.3333333333333, 46.449494674921446 ], "wc_reply_reviewers_avg": [ 9.666666666666666, 13.67073110293992 ], "wc_reply_authors_avg": [ 406.6666666666667, 82.23678549705653 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 516, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3406394348267726989&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "B1eySTVtvB", "title": "Combiner: Inductively Learning Tree Structured Attention in Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transformers employ dense attention mechanisms over text which can fail to capture or utilize the strong intrinsic structures present in natural language. This paper presents the Combiner model, a new Transformer architecture that learns tree-structured attention patterns inductively from language. Instead of dense or pre-specified structures, Combiner automatically learns tree-structured attention connections using a novel sparse residual attention mechanism. It first employs a sparsity-inducing gate that learns to prune attention connections in each network layer, so as to determine the nodes to be combined. Then the learned connections are propagated through layers using hierarchical attention blocks, which combine the sub-tree nodes in a bottom-up manner. Our experiments demonstrate the robust modeling performance of Combiner and usefulness of structures it learns in various information retrieval and unsupervised sentence parsing tasks. By leveraging search session structures, Combiner outperforms other pre-trained Transformers in generative query suggestion. 
Moreover, the learned tree structures align well with linguistic structures and improve the current state-of-the-art unsupervised constituency parsing by 8 average sentence-level F1.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiawei Wu;Chenyan Xiong;Tobias Schnabel;Yizhe Zhang;William Yang Wang;Paul Bennett", "authorids": "jiawei_wu@cs.ucsb.edu;chenyan.xiong@microsoft.com;tobias.schnabel@microsoft.com;yizhe.zhang@microsoft.com;william@cs.ucsb.edu;paul.n.bennett@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=B1eySTVtvB", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4076295307996107422&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1g5qyHYPS", "title": "Pruning Depthwise Separable Convolutions for Extra Efficiency Gain of Lightweight Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep convolutional neural networks are good at accuracy while bad at efficiency. To improve the inference speed, two kinds of directions are developed, lightweight model designing and network weight pruning. Lightweight models have been proposed to improve the speed with good enough accuracy. It is, however, not trivial if we can further speed up these \u201ccompact\u201d models by weight pruning. In this paper, we present a technique to gradually prune the depthwise separable convolution networks, such as MobileNet, for improving the speed of this kind of \u201cdense\u201d network. When pruning depthwise separable convolutions, we need to consider more structural constraints to ensure the speedup of inference. Instead of pruning the model with the desired ratio in one stage, the proposed multi-stage gradual pruning approach can stably prune the filters with a finer pruning ratio. 
Our method achieves 1.68 times speedup with neglectable accuracy drop for MobileNetV2.", "keywords": "Deep Learning;Network Pruning;Lightweight CNN", "primary_area": "", "supplementary_material": "", "author": "Cheng-Hao Tu;Jia-Hong Lee;Yi-Ming Chan;Chu-Song Chen", "authorids": "andytu28@iis.sinica.edu.tw;honghenry.lee@iis.sinica.edu.tw;yiming@iis.sinica.edu.tw;song@iis.sinica.edu.tw", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1g5qyHYPS", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "137;289;534", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 320.0, 163.55019616823046 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8600895004223954891&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1g5sA4twr", "title": "Deep Double Descent: Where Bigger Models and More Data Hurt", "track": "main", "status": "Poster", "tldr": "We demonstrate, and characterize, realistic settings where bigger models are worse, and more data hurts.", "abstract": "We show that a variety of modern deep learning tasks exhibit a \"double-descent\" phenomenon where, as we increase model size, performance first gets worse and then gets better. Moreover, we show that double descent occurs not just as a function of model size, but also as a function of the number of training epochs. We unify the above phenomena by defining a new complexity measure we call the effective model complexity, and conjecture a generalized double descent with respect to this measure. 
Furthermore, our notion of model complexity allows us to identify certain regimes where increasing (even quadrupling) the number of train samples actually hurts test performance.", "keywords": "deep learning;double descent;optimization;SGD;complexity", "primary_area": "", "supplementary_material": "", "author": "Preetum Nakkiran;Gal Kaplun;Yamini Bansal;Tristan Yang;Boaz Barak;Ilya Sutskever", "authorids": "preetum@cs.harvard.edu;galkaplun@g.harvard.edu;ybansal@g.harvard.edu;tristanyang@college.harvard.edu;b@boazbarak.org;ilyasu@openai.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nNakkiran2020Deep,\ntitle={Deep Double Descent: Where Bigger Models and More Data Hurt},\nauthor={Preetum Nakkiran and Gal Kaplun and Yamini Bansal and Tristan Yang and Boaz Barak and Ilya Sutskever},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g5sA4twr}\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=B1g5sA4twr)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1g5sA4twr", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "370;710;96", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "148;287;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 392.0, 251.14670347561136 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 145.0, 117.1864611065061 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 1228, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9967079231665217897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1g79grKPr", "title": "Goal-Conditioned Video Prediction", "track": "main", "status": "Reject", "tldr": "We propose a new class of visual generative models: goal-conditioned predictors. We show experimentally that conditioning on the goal allows to reduce uncertainty and produce predictions over much longer horizons.", "abstract": "Many processes can be concisely represented as a sequence of events leading from a starting state to an end state. Given raw ingredients, and a finished cake, an experienced chef can surmise the recipe. Building upon this intuition, we propose a new class of visual generative models: goal-conditioned predictors (GCP). Prior work on video generation largely focuses on prediction models that only observe frames from the beginning of the video. GCP instead treats videos as start-goal transformations, making video generation easier by conditioning on the more informative context provided by the first and final frames. Not only do existing forward prediction approaches synthesize better and longer videos when modified to become goal-conditioned, but GCP models can also utilize structures that are not linear in time, to accomplish hierarchical prediction. 
To this end, we study both auto-regressive GCP models and novel tree-structured GCP models that generate frames recursively, splitting the video iteratively into finer and finer segments delineated by subgoals. In experiments across simulated and real datasets, our GCP methods generate high-quality sequences over long horizons. Tree-structured GCPs are also substantially easier to parallelize than auto-regressive GCPs, making training and inference very efficient, and allowing the model to train on sequences that are thousands of frames in length.Finally, we demonstrate the utility of GCP approaches for imitation learning in the setting without access to expert actions. Videos are on the supplementary website: https://sites.google.com/view/video-gcp", "keywords": "predictive models;video prediction;latent variable models", "primary_area": "", "supplementary_material": "", "author": "Oleh Rybkin;Karl Pertsch;Frederik Ebert;Dinesh Jayaraman;Chelsea Finn;Sergey Levine", "authorids": "oleh@seas.upenn.edu;pertsch@usc.edu;febert@berkeley.edu;dineshjayaraman@berkeley.edu;cbfinn@cs.stanford.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nrybkin2020goalconditioned,\ntitle={Goal-Conditioned Video Prediction},\nauthor={Oleh Rybkin and Karl Pertsch and Frederik Ebert and Dinesh Jayaraman and Chelsea Finn and Sergey Levine},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g79grKPr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1g79grKPr", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "375;305;306", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "584;314;236", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 328.6666666666667, 32.76515764582181 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 378.0, 149.10399055692642 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jQHh3mWxkmAJ:scholar.google.com/&scioq=Goal-Conditioned+Video+Prediction&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1g8VkHFPH", "title": "Rethinking the Hyperparameters for Fine-tuning", "track": "main", "status": "Poster", "tldr": "This paper re-examines several common practices of setting hyper-parameters for fine-tuning and identify optimal hyperparameter depends on source-target domain similarity.", "abstract": "Fine-tuning from pre-trained ImageNet models has become the de-facto standard for various computer vision tasks. Current practices for fine-tuning typically involve selecting an ad-hoc choice of hyperparameters and keeping them fixed to values normally used for training from scratch. This paper re-examines several common practices of setting hyperparameters for fine-tuning. Our findings are based on extensive empirical evaluation for fine-tuning on various transfer learning benchmarks. (1) While prior works have thoroughly investigated learning rate and batch size, momentum for fine-tuning is a relatively unexplored parameter. 
We find that the value of momentum also affects fine-tuning performance and connect it with previous theoretical findings. (2) Optimal hyperparameters for fine-tuning, in particular, the effective learning rate, are not only dataset dependent but also sensitive to the similarity between the source domain and target domain. This is in contrast to hyperparameters for training from scratch. (3) Reference-based regularization that keeps models close to the initial model does not necessarily apply for \"dissimilar\" datasets. Our findings challenge common practices of fine-tuning and encourages deep learning practitioners to rethink the hyperparameters for fine-tuning.", "keywords": "fine-tuning;hyperparameter search;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Hao Li;Pratik Chaudhari;Hao Yang;Michael Lam;Avinash Ravichandran;Rahul Bhotika;Stefano Soatto", "authorids": "hao.li.ict@gmail.com;pratikac@seas.upenn.edu;lancelot365@gmail.com;michlam@amazon.com;avinash.a.ravichandran@gmail.com;bhotikar@amazon.com;soatto@ucla.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nLi2020Rethinking,\ntitle={Rethinking the Hyperparameters for Fine-tuning},\nauthor={Hao Li and Pratik Chaudhari and Hao Yang and Michael Lam and Avinash Ravichandran and Rahul Bhotika and Stefano Soatto},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g8VkHFPH}\n}", "github": "[![github](/images/github_icon.svg) richardaecn/cvpr18-inaturalist-transfer](https://github.com/richardaecn/cvpr18-inaturalist-transfer)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1g8VkHFPH", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "414;215;456", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "510;265;1494", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 361.6666666666667, 105.1168661802451 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 756.3333333333334, 531.1122501150037 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14029720773108023404&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1gF56VYPH", "title": "Deep 3D Pan via local adaptive \"t-shaped\" convolutions with global and local adaptive dilations", "track": "main", "status": "Poster", "tldr": "Novel architecture for stereoscopic view synthesis at arbitrary camera shifts utilizing adaptive t-shaped kernels with adaptive dilations.", "abstract": "Recent advances in deep learning have shown promising results in many low-level vision tasks. However, solving the single-image-based view synthesis is still an open problem. In particular, the generation of new images at parallel camera views given a single input image is of great interest, as it enables 3D visualization of the 2D input scenery. 
We propose a novel network architecture to perform stereoscopic view synthesis at arbitrary camera positions along the X-axis, or \u201cDeep 3D Pan\u201d, with \u201ct-shaped\u201d adaptive kernels equipped with globally and locally adaptive dilations. Our proposed network architecture, the monster-net, is devised with a novel t-shaped adaptive kernel with globally and locally adaptive dilation, which can efficiently incorporate global camera shift into and handle local 3D geometries of the target image\u2019s pixels for the synthesis of naturally looking 3D panned views when a 2-D input image is given. Extensive experiments were performed on the KITTI, CityScapes, and our VICLAB_STEREO indoors dataset to prove the efficacy of our method. Our monster-net significantly outperforms the state-of-the-art method (SOTA) by a large margin in all metrics of RMSE, PSNR, and SSIM. Our proposed monster-net is capable of reconstructing more reliable image structures in synthesized images with coherent geometry. Moreover, the disparity information that can be extracted from the \u201ct-shaped\u201d kernel is much more reliable than that of the SOTA for the unsupervised monocular depth estimation task, confirming the effectiveness of our method.", "keywords": "Deep learning;Stereoscopic view synthesis;Monocular depth;Deep 3D Pan", "primary_area": "", "supplementary_material": "", "author": "Juan Luis Gonzalez Bello;Munchurl Kim", "authorids": "juanluisgb@kaist.ac.kr;mkimee@kaist.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nBello2020Deep,\ntitle={Deep 3D Pan via local adaptive \"t-shaped\" convolutions with global and local adaptive dilations},\nauthor={Juan Luis Gonzalez Bello and Munchurl Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gF56VYPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gF56VYPH", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "367;239;295", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "275;0;206", "reply_reviewers": "0;0;0", "reply_authors": "1;0;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 300.3333333333333, 52.39168721170267 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 160.33333333333334, 116.81989937030801 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11744291359883831287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "B1gHokBKwS", "title": "Learning to Guide Random Search", "track": "main", "status": "Poster", "tldr": "We improve the sample-efficiency of the random search for functions defined on low-dimensional manifolds. Our method jointly learns the underlying manifold and optimizes the function.", "abstract": "We are interested in derivative-free optimization of high-dimensional functions. The sample complexity of existing methods is high and depends on problem dimensionality, unlike the dimensionality-independent rates of first-order methods. 
The recent success of deep learning suggests that many datasets lie on low-dimensional manifolds that can be represented by deep nonlinear models. We therefore consider derivative-free optimization of a high-dimensional function that lies on a latent low-dimensional manifold. We develop an online learning approach that learns this manifold while performing the optimization. In other words, we jointly learn the manifold and optimize the function. Our analysis suggests that the presented method significantly reduces sample complexity. We empirically evaluate the method on continuous optimization benchmarks and high-dimensional continuous control problems. Our method achieves significantly lower sample complexity than Augmented Random Search, Bayesian optimization, covariance matrix adaptation (CMA-ES), and other derivative-free optimization algorithms.", "keywords": "Random search;Derivative-free optimization;Learning continuous control", "primary_area": "", "supplementary_material": "", "author": "Ozan Sener;Vladlen Koltun", "authorids": "ozansener@gmail.com;vkoltun@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nSener2020Learning,\ntitle={Learning to Guide Random Search},\nauthor={Ozan Sener and Vladlen Koltun},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gHokBKwS}\n}", "github": "https://github.com/intel-isl/LMRS", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=B1gHokBKwS", "pdf_size": 0, "rating": "6;6;6;8", "confidence": "0;0;0;0", "wc_review": "854;588;311;533", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "802;255;200;653", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 571.5, 193.2750630578089 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 477.5, 256.2289015704513 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10046802470639742746&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1gKVeBtDH", "title": "Faster and Just As Accurate: A Simple Decomposition for Transformer Models", "track": "main", "status": "Withdraw", "tldr": "Inference in large Transformers is expensive due to the self-attention in multiple layers. We show a simple decomposition technique can yield a faster, low memory-footprint model that is just as accurate of the original models.", "abstract": "Large pre-trained Transformers such as BERT have been tremendously effective for many NLP tasks. However, inference in these large-capacity models is prohibitively slow and expensive. Transformers are essentially a stack of self-attention layers which encode each input position using the entire input sequence as its context. However, we find that it may not be necessary to apply this expensive sequence-wide self-attention over at all layers. Based on this observation, we propose a decomposition to a pre-trained Transformer that allows the lower layers to process segments of the input independently enabling parallelism and caching. 
We show that the information loss due to this decomposition can be recovered in the upper layers with auxiliary supervision during fine-tuning. We evaluate de-composition with pre-trained BERT models on five different paired-input tasks in question answering, sentence similarity, and natural language inference. Results show that decomposition enables faster inference (up to 4x), significant memory reduction (up to 70%) while retaining most (up to 99%) of the original performance. We will release the code at.", "keywords": "Faster Inference;Transformers;Pre-trained Representations", "primary_area": "", "supplementary_material": "", "author": "Qingqing Cao;Harsh Trivedi;Aruna Balasubramanian;Niranjan Balasubramanian", "authorids": "qicao@cs.stonybrook.edu;hjtrivedi@cs.stonybrook.edu;arunab@cs.stonybrook.edu;niranjan@cs.stonybrook.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gKVeBtDH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "186;319;263", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "233;702;484", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 256.0, 54.5221667458903 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 473.0, 191.62637257608012 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7004729919793776023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1gL904FwH", "title": "SIMULTANEOUS ATTRIBUTED NETWORK EMBEDDING AND CLUSTERING", "track": "main", "status": "Withdraw", "tldr": "This paper propose a novel matrix decomposition framework for simultaneous attributed network data embedding and clustering.", "abstract": "To deal simultaneously with both, the attributed network embedding and clustering, we propose a new model. It exploits both content and structure information, capitalising on their simultaneous use. The proposed model relies on the approximation of the relaxed continuous embedding solution by the true discrete clustering one. Thereby, we show that incorporating an embedding representation provides simpler and more interpretable solutions. 
Experimental results demonstrate that the proposed algorithm performs better, in terms of clustering and embedding, than the state-of-the-art algorithms, including deep learning methods devoted to similar tasks for attributed network datasets with different properties.", "keywords": "Attributed network;Embedding;clustering;matrix decomposition;spectral rotation", "primary_area": "", "supplementary_material": "", "author": "Lazhar labiod;Mohamed Nadif", "authorids": "lazhar.labiod@parisdescartes.fr;mohamed.nadif@parisdescartes.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gL904FwH", "pdf_size": 0, "rating": "1;1;1", "confidence": "0;0;0", "wc_review": "392;172;169", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 244.33333333333334, 104.42328390843788 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_VB8Ncal3jUJ:scholar.google.com/&scioq=SIMULTANEOUS+ATTRIBUTED+NETWORK+EMBEDDING+AND+CLUSTERING&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1gNKxrYPB", "title": "Attributed Graph Learning with 2-D Graph Convolution", "track": "main", "status": "Reject", "tldr": "We propose a novel 2-D graph convolution framework to jointly model node relations and attribute relations for attributed graph learning.", "abstract": "Graph convolutional neural networks have demonstrated promising performance in attributed graph learning, thanks to the use of graph convolution that effectively combines graph structures and node features for learning node representations. However, one intrinsic limitation of the commonly adopted 1-D graph convolution is that it only exploits graph connectivity for feature smoothing, which may lead to inferior performance on sparse and noisy real-world attributed networks. To address this problem, we propose to explore relational information among node attributes to complement node relations for representation learning. In particular, we propose to use 2-D graph convolution to jointly model the two kinds of relations and develop a computationally efficient dimensionwise separable 2-D graph convolution (DSGC). Theoretically, we show that DSGC can reduce intra-class variance of node features on both the node dimension and the attribute dimension to facilitate learning. Empirically, we demonstrate that by incorporating attribute relations, DSGC achieves significant performance gain over state-of-the-art methods on node classification and clustering on several real-world attributed networks. 
\n", "keywords": "2-D Graph Convolution;Attributed Graph;Representation learning", "primary_area": "", "supplementary_material": "", "author": "Qimai Li;Xiaotong Zhang;Han Liu;Xiao-Ming Wu", "authorids": "csqmli@comp.polyu.edu.hk;zxt.dut@hotmail.com;liu.han.dut@gmail.com;xiao-ming.wu@polyu.edu.hk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2020attributed,\ntitle={Attributed Graph Learning with 2-D Graph Convolution},\nauthor={Qimai Li and Xiaotong Zhang and Han Liu and Xiao-Ming Wu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gNKxrYPB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gNKxrYPB", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "528;108;305", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "515;34;273", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 313.6666666666667, 171.57376126772868 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 274.0, 196.368700832558 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4876580570566637484&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1gNfkrYvS", "title": "Capsule Networks without Routing Procedures", "track": "main", "status": "Withdraw", "tldr": "Routing procedures are not necessary for CapsNets", "abstract": "We propose Pure CapsNets (P-CapsNets) without routing procedures. Specifically, we make three modifications to CapsNets. First, we remove routing procedures from CapsNets based on the observation that the coupling coefficients can be learned implicitly. Second, we replace the convolutional layers in CapsNets to improve efficiency. Third, we package the capsules into rank-3 tensors to further improve efficiency. The experiment shows that P-CapsNets achieve better performance than CapsNets with varied routine procedures by using significantly fewer parameters on MNIST&CIFAR10. The high efficiency of P-CapsNets is even comparable to some deep compressing models. For example, we achieve more than 99% percent accuracy on MNIST by using only 3888 parameters. We visualize the capsules as well as the corresponding correlation matrix to show a possible way of initializing CapsNets in the future. We also explore the adversarial robustness of P-CapsNets compared to CNNs. 
", "keywords": "CapsNets;routing procedures", "primary_area": "", "supplementary_material": "", "author": "Zhenhua Chen;Xiwen Li;Chuhua Wang;David Crandall", "authorids": "chen478@iu.edu;xiwenli@wustl.edu;cw234@iu.edu;djcran@indiana.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "https://github.com/chenzhenhua986/CAFFE-CapsNet", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gNfkrYvS", "pdf_size": 0, "rating": "1;3;3", "confidence": "0;0;0", "wc_review": "299;308;271", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "228;238;100", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 292.6666666666667, 15.755069730795297 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 188.66666666666666, 62.829575484444874 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1gOe6NKPB", "title": "MDE: Multiple Distance Embeddings for Link Prediction in Knowledge Graphs", "track": "main", "status": "Withdraw", "tldr": "A novel method of modelling Knowledge Graphs based on Distance Embeddings and Neural Networks", "abstract": "Over the past decade, knowledge graphs became popular for capturing structured domain knowledge. \nRelational learning models enable the prediction of missing links inside knowledge graphs. More specifically, latent distance approaches model the relationships among entities via a distance between latent representations.\nTranslating embedding models (e.g., TransE) are among the most popular latent distance approaches which use one distance function to learn multiple relation patterns. \nHowever, they are mostly inefficient in capturing symmetric relations since the representation vector norm for all the symmetric relations becomes equal to zero. They also lose information when learning relations with reflexive patterns since they become symmetric and transitive.\nWe propose the Multiple Distance Embedding model (MDE) that addresses these limitations and a framework which enables collaborative combinations of latent distance-based terms (MDE).\nOur solution is based on two principles: 1) using limit-based loss instead of margin ranking loss and 2) by learning independent embedding vectors for each of terms we can collectively train and predict using contradicting distance terms.\nWe further demonstrate that MDE allows modeling relations with (anti)symmetry, inversion, and composition patterns. 
We propose MDE as a neural network model which allows us to map non-linear relations between the embedding vectors and the expected output of the score function.\nOur empirical results show that MDE outperforms the state-of-the-art embedding models on several benchmark datasets.", "keywords": "Representation Learning;Knowledge Graph embedding;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Afshin Sadeghi;Damien Graux;Hamed Shariat Yazdi;Jens Lehmann", "authorids": "sadeghi@cs.uni-bonn.de;dam.graux@gmail.com;shariat@cs.uni-bonn.de;jens.lehmann@cs.uni-bonn.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "https://drive.google.com/open?id=1eE5KvWtg6IJDlBKW-D7vR7lURCQNLich", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=B1gOe6NKPB", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9398581189907500448&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "B1gR3ANFPS", "title": "Non-linear System Identification from Partial Observations via Iterative Smoothing and Learning", "track": "main", "status": "Reject", "tldr": "This work presents a scalable algorithm for non-linear offline system identification from partial observations.", "abstract": "System identification is the process of building a mathematical model of an unknown system from measurements of its inputs and outputs. It is a key step for model-based control, estimator design, and output prediction. This work presents an algorithm for non-linear offline system identification from partial observations, i.e. situations in which the system's full-state is not directly observable. The algorithm presented, called SISL, iteratively infers the system's full state through non-linear optimization and then updates the model parameters. We test our algorithm on a simulated system of coupled Lorenz attractors, showing our algorithm's ability to identify high-dimensional systems that prove intractable for particle-based approaches. We also use SISL to identify the dynamics of an aerobatic helicopter. By augmenting the state with unobserved fluid states, we learn a model that predicts the acceleration of the helicopter better than state-of-the-art approaches.", "keywords": "System Identification;Dynamical Systems;Partial Observations;Non-linear Programming;Expectation Maximization;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Kunal Menda;Jean de Becdeli\u00e8vre;Jayesh K Gupta;Ilan Kroo;Mykel J. 
Kochenderfer;Zachary Manchester", "authorids": "kmenda@stanford.edu;jeandb@stanford.edu;jkg@cs.stanford.edu;kroo@stanford.edu;mykel@stanford.edu;zacmanchester@stanford.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmenda2020nonlinear,\ntitle={Non-linear System Identification from Partial Observations via Iterative Smoothing and Learning},\nauthor={Kunal Menda and Jean de Becdeli{\\`e}vre and Jayesh K Gupta and Ilan Kroo and Mykel J. Kochenderfer and Zachary Manchester},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gR3ANFPS}\n}", "github": "https://drive.google.com/drive/folders/1M4aOCo5HW9MjibSNJqKnMOZAmFCKovBc?usp=sharing", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gR3ANFPS", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "522;278;263", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1271;110;374", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 354.3333333333333, 118.71628176267801 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 585.0, 496.9044173681695 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:d8pYlcXoAqQJ:scholar.google.com/&scioq=Non-linear+System+Identification+from+Partial+Observations+via+Iterative+Smoothing+and+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1gUn24tPr", "title": "Classification Attention for Chinese NER", "track": "main", "status": "Reject", "tldr": "Classification Attention for Chinese NER", "abstract": "The character-based model, such as BERT, has achieved remarkable success in Chinese named entity recognition (NER). However, such model would likely miss the overall information of the entity words. In this paper, we propose to combine priori entity information with BERT. Instead of relying on additional lexicons or pre-trained word embeddings, our model has generated entity classification embeddings directly on the pre-trained BERT, having the merit of increasing model practicability and avoiding OOV problem. 
Experiments show that our model has achieved state-of-the-art results on 3 Chinese NER datasets.", "keywords": "Chinese NER;NER;tagging;deeplearning;nlp", "primary_area": "", "supplementary_material": "", "author": "Yuchen Ge;FanYang;PeiYang", "authorids": "geyc2@lenovo.com;yangfan24@lenovo.com;yangpei4@lenovo.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nge2020classification,\ntitle={Classification Attention for Chinese {\\{}NER{\\}}},\nauthor={Yuchen Ge and FanYang and PeiYang},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gUn24tPr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gUn24tPr", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "595;90;438", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 374.3333333333333, 211.0234320848964 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6NuZNq6BaxgJ:scholar.google.com/&scioq=Classification+Attention+for+Chinese+NER&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gX8JrYPr", "title": "Connecting the Dots Between MLE and RL for Sequence Prediction", "track": "main", "status": "Reject", "tldr": "An entropy regularized policy optimization formalism subsumes a set of sequence prediction learning algorithms. A new interpolation algorithm with improved results on text generation and game imitation learning.", "abstract": "Sequence prediction models can be learned from example sequences with a variety of training algorithms. Maximum likelihood learning is simple and efficient, yet can suffer from compounding error at test time. \nReinforcement learning such as policy gradient addresses the issue but can have prohibitively poor exploration efficiency. A rich set of other algorithms, such as data noising, RAML, and softmax policy gradient, have also been developed from different perspectives. \nIn this paper, we present a formalism of entropy regularized policy optimization, and show that the apparently distinct algorithms, including MLE, can be reformulated as special instances of the formulation. The difference between them is characterized by the reward function and two weight hyperparameters.\nThe unifying interpretation enables us to systematically compare the algorithms side-by-side, and gain new insights into the trade-offs of the algorithm design.\nThe new perspective also leads to an improved approach that dynamically interpolates among the family of algorithms, and learns the model in a scheduled way. 
Experiments on machine translation, text summarization, and game imitation learning demonstrate superiority of the proposed approach.", "keywords": "Sequence generation;sequence prediction;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Bowen Tan;Zhiting Hu;Zichao Yang;Ruslan Salakhutdinov;Eric Xing", "authorids": "bwkevintan@gmail.com;zhitinghu@gmail.com;yangtze2301@gmail.com;rsalakhu@cs.cmu.edu;epxing@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntan2020connecting,\ntitle={Connecting the Dots Between {\\{}MLE{\\}} and {\\{}RL{\\}} for Sequence Prediction},\nauthor={Bowen Tan and Zhiting Hu and Zichao Yang and Ruslan Salakhutdinov and Eric Xing},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gX8JrYPr}\n}", "github": "https://drive.google.com/file/d/13diaxzuxTSB-DReqEhkYPMmZ4BQ6vsEo/view", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gX8JrYPr", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "282;180;218", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "320;165;161", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 226.66666666666666, 42.089850980438925 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 215.33333333333334, 74.02852303147904 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9258702747622648930&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1gX8kBtPr", "title": "Universal Approximation with Certified Networks", "track": "main", "status": "Poster", "tldr": "We prove that for a large class of functions f there exists an interval certified robust network approximating f up to arbitrary precision.", "abstract": "Training neural networks to be certifiably robust is critical to ensure their safety against adversarial attacks. However, it is currently very difficult to train a neural network that is both accurate and certifiably robust. In this work we take a step towards addressing this challenge. We prove that for every continuous function $f$, there exists a network $n$ such that:\n(i) $n$ approximates $f$ arbitrarily close, and (ii) simple interval bound propagation of a region $B$ through $n$ yields a result that is arbitrarily close to the optimal output of $f$ on $B$. Our result can be seen as a Universal Approximation Theorem for interval-certified ReLU networks. 
To the best of our knowledge, this is the first work to prove the existence of accurate, interval-certified networks.", "keywords": "adversarial robustness;universal approximation;certified network;interval bound propagation", "primary_area": "", "supplementary_material": "", "author": "Maximilian Baader;Matthew Mirman;Martin Vechev", "authorids": "mbaader@inf.ethz.ch;matthew.mirman@inf.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nBaader2020Universal,\ntitle={Universal Approximation with Certified Networks},\nauthor={Maximilian Baader and Matthew Mirman and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gX8kBtPr}\n}", "github": "https://github.com/eth-sri/UniversalCertificationTheory", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gX8kBtPr", "pdf_size": 0, "rating": "3;6;8", "confidence": "0;0;0", "wc_review": "514;162;457", "wc_reply_reviewers": "130;0;0", "wc_reply_authors": "1354;115;133", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 377.6666666666667, 154.2645635120249 ], "wc_reply_reviewers_avg": [ 43.333333333333336, 61.282587702834114 ], "wc_reply_authors_avg": [ 534.0, 579.8741242718113 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8301791316229019028&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "B1gXR3NtwS", "title": "Deep Bayesian Structure Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bayesian neural networks (BNNs) introduce uncertainty estimation to deep networks by performing Bayesian inference on network weights. However, such models bring the challenges of inference, and further BNNs with weight uncertainty rarely achieve superior performance to standard models. In this paper, we investigate a new line of Bayesian deep learning by performing Bayesian reasoning on the structure of deep neural networks. Drawing inspiration from the neural architecture search, we define the network structure as random weights on the redundant operations between computational nodes, and apply stochastic variational inference techniques to learn the structure distributions of networks. Empirically, the proposed method substantially surpasses the advanced deep neural networks across a range of classification and segmentation tasks. More importantly, our approach also preserves benefits of Bayesian principles, producing improved uncertainty estimation than the strong baselines including MC dropout and variational BNNs algorithms (e.g. noisy EK-FAC). 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhijie Deng;Yucen Luo;Jun Zhu;Bo Zhang", "authorids": "dzj17@mails.tsinghua.edu.cn;luoyc15@mails.tsinghua.edu.cn;dcszj@tsinghua.edu.cn;dcszb@tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndeng2020deep,\ntitle={Deep Bayesian Structure Networks},\nauthor={Zhijie Deng and Yucen Luo and Jun Zhu and Bo Zhang},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gXR3NtwS}\n}", "github": "https://github.com/anonymousest/DBSN", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gXR3NtwS", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "537;755;381", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1086;1659;1066", "reply_reviewers": "0;0;0", "reply_authors": "2;3;2", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 557.6666666666666, 153.38260078060426 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1270.3333333333333, 274.9500964821718 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1gXWCVtvr", "title": "Adapting Behaviour for Learning Progress", "track": "main", "status": "Reject", "tldr": "Don\u2019t tune exploration by hand: automagically adapt behaviour modulation for learning progress instead!", "abstract": "Determining what experience to generate to best facilitate learning (i.e. exploration) is one of the distinguishing features and open challenges in reinforcement learning. The advent of distributed agents that interact with parallel instances of the environment has enabled larger scale and greater flexibility, but has not removed the need to tune or tailor exploration to the task, because the ideal data for the learning algorithm necessarily depends on its process of learning. We propose to dynamically adapt the data generation by using a non-stationary multi-armed bandit to optimize a proxy of the learning progress. The data distribution is controlled via modulating multiple parameters of the policy (such as stochasticity, consistency or optimism) without significant overhead. The adaptation speed of the bandit can be increased by exploiting the factored modulation structure. 
We demonstrate on a suite of Atari 2600 games how this unified approach produces results comparable to per-task tuning at a fraction of the cost.", "keywords": "adaptation;behaviour;reinforcement learning;modulated behaviour;exploration;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Tom Schaul;Diana Borsa;David Ding;David Szepesvari;Georg Ostrovski;Will Dabney;Simon Osindero", "authorids": "schaul@google.com;borsa@google.com;fding@google.com;dsz@google.com;ostrovski@google.com;wdabney@google.com;osindero@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nschaul2020adapting,\ntitle={Adapting Behaviour for Learning Progress},\nauthor={Tom Schaul and Diana Borsa and David Ding and David Szepesvari and Georg Ostrovski and Will Dabney and Simon Osindero},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gXWCVtvr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gXWCVtvr", "pdf_size": 0, "rating": "3;3;3;6", "confidence": "0;0;0;0", "wc_review": "437;474;402;104", "wc_reply_reviewers": "0;0;0;14", "wc_reply_authors": "410;672;822;177", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "rating_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 354.25, 146.70783039769896 ], "wc_reply_reviewers_avg": [ 3.5, 6.06217782649107 ], "wc_reply_authors_avg": [ 520.25, 247.01050078893408 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8896508186467212309&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1gXYR4YDH", "title": "DSReg: Using Distant Supervision as a Regularizer", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we aim at tackling a general issue in NLP tasks where some of the negative examples are highly similar to the positive examples, i.e., hard-negative examples). We propose the distant supervision as a regularizer (DSReg) approach to tackle this issue. We convert the original task to a multi-task learning problem, in which we first utilize the idea of distant supervision to retrieve hard-negative examples. The obtained hard-negative examples are then used as a regularizer, and we jointly optimize the original target objective of distinguishing positive examples from negative examples along with the auxiliary task objective of distinguishing soften positive examples (comprised of positive examples and hard-negative examples) from easy-negative examples. In the neural context, this can be done by feeding the final token representations to different output layers. Using this unbelievably simple strategy, we improve the performance of a range of different NLP tasks, including text classification, sequence labeling and reading comprehension. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxian Meng;Muyu Li;Xiaoya Li;Wei Wu;Fei Wu;Jiwei Li", "authorids": "yuxian_meng@shannonai.com;muyu_li@shannonai.com;xiaoya_li@shannonai.com;wei_wu@shannonai.com;wufei@zju.edu.cn;jiwei_li@shannonai.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmeng2020dsreg,\ntitle={{\\{}DSR{\\}}eg: Using Distant Supervision as a Regularizer},\nauthor={Yuxian Meng and Muyu Li and Xiaoya Li and Wei Wu and Fei Wu and Jiwei Li},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gXYR4YDH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gXYR4YDH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "631;268;202", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "358;530;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 367.0, 188.61071019430472 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 296.0, 220.76835522027758 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4670065548239922208&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "B1gZV1HYvS", "title": "Multi-Agent Interactions Modeling with Correlated Policies", "track": "main", "status": "Poster", "tldr": "Modeling complex multi-agent interactions under multi-agent imitation learning framework with explicit modeling of correlated policies by approximating opponents\u2019 policies. ", "abstract": "In multi-agent systems, complex interacting behaviors arise due to the high correlations among agents. However, previous work on modeling multi-agent interactions from demonstrations is primarily constrained by assuming the independence among policies and their reward structures. \nIn this paper, we cast the multi-agent interactions modeling problem into a multi-agent imitation learning framework with explicit modeling of correlated policies by approximating opponents\u2019 policies, which can recover agents' policies that can regenerate similar interactions. Consequently, we develop a Decentralized Adversarial Imitation Learning algorithm with Correlated policies (CoDAIL), which allows for decentralized training and execution. Various experiments demonstrate that CoDAIL can better regenerate complex interactions close to the demonstrators and outperforms state-of-the-art multi-agent imitation learning methods. 
Our code is available at \\url{https://github.com/apexrl/CoDAIL}.", "keywords": "Multi-agent reinforcement learning;Imitation learning", "primary_area": "", "supplementary_material": "", "author": "Minghuan Liu;Ming Zhou;Weinan Zhang;Yuzheng Zhuang;Jun Wang;Wulong Liu;Yong Yu", "authorids": "minghuanliu@sjtu.edu.cn;mingak@sjtu.edu.cn;wnzhang@sjtu.edu.cn;zhuangyuzheng@huawei.com;w.j@huawei.com;liuwulong@huawei.com;yyu@apex.sjtu.edu.cn", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nLiu2020Multi-Agent,\ntitle={Multi-Agent Interactions Modeling with Correlated Policies},\nauthor={Minghuan Liu and Ming Zhou and Weinan Zhang and Yuzheng Zhuang and Jun Wang and Wulong Liu and Yong Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gZV1HYvS}\n}", "github": "https://github.com/apexrl/CoDAIL", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gZV1HYvS", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "276;473;359", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "363;1036;472", "reply_reviewers": "0;0;0", "reply_authors": "1;3;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 369.3333333333333, 80.75614871671107 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 623.6666666666666, 294.9399185521613 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1707555896923900607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1g_BT4FvS", "title": "Samples Are Useful? Not Always: denoising policy gradient updates using variance explained", "track": "main", "status": "Reject", "tldr": "SAUNA uses the fraction of variance explained (Vex) as a metric to filter the transitions used for policy gradient updates: such filtering improves the sampling prior for a better exploration of the environment and yields a better performance.", "abstract": "Policy gradient algorithms in reinforcement learning optimize the policy directly and rely on efficiently sampling an environment. However, while most sampling procedures are based solely on sampling the agent's policy, other measures directly accessible through these algorithms could be used to improve sampling before each policy update. Following this line of thoughts, we propose the use of SAUNA, a method where transitions are rejected from the gradient updates if they do not meet a particular criterion, and kept otherwise. This criterion, the fraction of variance explained Vex, is a measure of the discrepancy between a model and actual samples. In this work, Vex is used to evaluate the impact each transition will have on learning: this criterion refines sampling and improves the policy gradient algorithm. In this paper: (a) We introduce and explore Vex, the criterion used for denoising policy gradient updates. (b) We conduct experiments across a variety of benchmark environments, including standard continuous control problems. Our results show better performance with SAUNA. 
(c) We investigate why Vex provides a reliable assessment for the selection of samples that will positively impact learning. (d) We show how this criterion can work as a dynamic tool to adjust the ratio between exploration and exploitation.", "keywords": "reinforcement learning;policy gradient;sampling", "primary_area": "", "supplementary_material": "", "author": "Yannis Flet-Berliac;Philippe Preux", "authorids": "yannis.flet-berliac@inria.fr;philippe.preux@inria.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nflet-berliac2020samples,\ntitle={Samples Are Useful? Not Always: denoising policy gradient updates using variance explained},\nauthor={Yannis Flet-Berliac and Philippe Preux},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g_BT4FvS}\n}", "github": "https://github.com/iclr2020-submission/denoising-gradient-updates", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1g_BT4FvS", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "408;394;408", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "390;822;487", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 403.3333333333333, 6.599663291074444 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 566.3333333333334, 185.06995674308914 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1gcblSKwB", "title": "Meta-Learning with Network Pruning for Overfitting Reduction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Meta-Learning has achieved great success in few-shot learning. However, the existing meta-learning models have been evidenced to overfit on meta-training tasks when using deeper and wider convolutional neural networks. This means that we cannot improve the meta-generalization performance by merely deepening or widening the networks. To remedy such a deficiency of meta-overfitting, we propose in this paper a sparsity constrained meta-learning approach to learn from meta-training tasks a subnetwork from which first-order optimization methods can quickly converge towards the optimal network in meta-testing tasks. Our theoretical analysis shows the benefit of sparsity for improving the generalization gap of the learned meta-initialization network. We have implemented our approach on top of the widely applied Reptile algorithm assembled with varying network pruning routines including Dense-Sparse-Dense (DSD) and Iterative Hard Thresholding (IHT). 
Extensive experimental results on benchmark datasets with different over-parameterized deep networks demonstrate that our method can not only effectively ease meta-overfitting but also in many cases improve the meta-generalization performance when applied to few-shot classification tasks.", "keywords": "Meta-Learning;Few-shot Learning;Network Pruning;Generalization Analysis", "primary_area": "", "supplementary_material": "", "author": "Hongduan Tian;Bo Liu;Xiao-Tong Yuan;Qingshan Liu", "authorids": "hongduan_tian@nuist.edu.cn;kfliubo@gmail.com;xtyuan1980@gmail.com;qsliu@nuist.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ntian2020metalearning,\ntitle={Meta-Learning with Network Pruning for Overfitting Reduction},\nauthor={Hongduan Tian and Bo Liu and Xiao-Tong Yuan and Qingshan Liu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gcblSKwB}\n}", "github": "https://drive.google.com/open?id=1VOY1sCA1j5G1LE2AbDrPoZM-1ZwwVOHA", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=B1gcblSKwB", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "1842;194;536", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "883;178;529", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 857.3333333333334, 710.1254975159125 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 530.0, 287.8159133890967 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16843083299375496395&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1gd0nEFwS", "title": "Universal Source-Free Domain Adaptation", "track": "main", "status": "Withdraw", "tldr": "A novel unsupervised domain adaptation paradigm - performing adaptation without accessing the source data ('source-free') and without any assumption about the source-target category-gap ('universal').", "abstract": "There is a strong incentive to develop versatile learning techniques that can transfer the knowledge of class-separability from a labeled source domain to an unlabeled target domain in the presence of a domain-shift. Existing domain adaptation (DA) approaches are not equipped for practical DA scenarios as a result of their reliance on the knowledge of source-target label-set relationship (e.g. Closed-set, Open-set or Partial DA). Furthermore, almost all the prior unsupervised DA works require coexistence of source and target samples even during deployment, making them unsuitable for incremental, real-time adaptation. Devoid of such highly impractical assumptions, we propose a novel two-stage learning process. Initially, in the procurement-stage, the objective is to equip the model for future source-free deployment, assuming no prior knowledge of the upcoming category-gap and domain-shift. To achieve this, we enhance the model\u2019s ability to reject out-of-source distribution samples by leveraging the available source data, in a novel generative classifier framework. 
Subsequently, in the deployment-stage, the objective is to design a unified adaptation algorithm capable of operating across a wide range of category-gaps, with no access to the previously seen source samples. To achieve this, in contrast to the usage of complex adversarial training regimes, we define a simple yet effective source-free adaptation objective by utilizing a novel instance-level weighing mechanism, named as Source Similarity Metric (SSM). A thorough evaluation shows the practical usability of the proposed learning framework with superior DA performance even over state-of-the-art source-dependent approaches.", "keywords": "unsupervised domain adaptation;knowledge transfer;source-free adaptation", "primary_area": "", "supplementary_material": "", "author": "Jogendra Nath Kundu;Naveen Venkat;Rahul M V;R. Venkatesh Babu", "authorids": "jogendrak@iisc.ac.in;nav.naveenvenkat@gmail.com;rmvenkat@andrew.cmu.edu;venky@iisc.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gd0nEFwS", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "400;319;174", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "918;645;64", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 297.6666666666667, 93.48915563969021 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 542.3333333333334, 356.12201404699607 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 410, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13396021133130094693&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "B1gd7REFDB", "title": "Context-Aware Object Detection With Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "A deep neural network that leverages conditional random field to enforce context semantics constrains in object detection", "abstract": "Although the state-of-the-art object detection methods are successful in detecting and classifying objects by leveraging deep convolutional neural networks (CNNs), these methods overlook the semantic context which implies the probabilities that different classes of objects occur jointly. In this work, we propose a context-aware CNN (or conCNN for short) that for the first time effectively enforces the semantics context constraints in the CNN-based object detector by leveraging the popular conditional random field (CRF) model in CNN. In particular, conCNN features a context-aware module that naturally models the mean-field inference method for CRF using a stack of common CNN operations. It can be seamlessly plugged into any existing region-based object detection paradigm. 
Our experiments using COCO datasets showcase that conCNN improves the average precision (AP) of object detection by 2 percentage points, while only introducing negligible extra training overheads.", "keywords": "Object Detection;CNN;Context;CRF", "primary_area": "", "supplementary_material": "", "author": "Yizhou Yan;Lei Cao;Samuel Madden;Elke Rundensteiner", "authorids": "yyan2@wpi.edu;lcao@csail.mit.edu;madden@csail.mit.edu;rundenst@cs.wpi.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyan2020contextaware,\ntitle={Context-Aware Object Detection With Convolutional Neural Networks},\nauthor={Yizhou Yan and Lei Cao and Samuel Madden and Elke Rundensteiner},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gd7REFDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gd7REFDB", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "323;333;283", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "267;189;292", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 313.0, 21.602468994692867 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 249.33333333333334, 43.86595744107522 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8EfYfQK731EJ:scholar.google.com/&scioq=Context-Aware+Object+Detection+With+Convolutional+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gdkxHFDH", "title": "Training individually fair ML models with sensitive subspace robustness", "track": "main", "status": "Spotlight", "tldr": "Algorithm for training individually fair classifier using adversarial robustness", "abstract": "We consider training machine learning models that are fair in the sense that their performance is invariant under certain sensitive perturbations to the inputs. For example, the performance of a resume screening system should be invariant under changes to the gender and/or ethnicity of the applicant. We formalize this notion of algorithmic fairness as a variant of individual fairness and develop a distributionally robust optimization approach to enforce it during training. We also demonstrate the effectiveness of the approach on two ML tasks that are susceptible to gender and racial biases. 
", "keywords": "fairness;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Mikhail Yurochkin;Amanda Bower;Yuekai Sun", "authorids": "mikhail.yurochkin@ibm.com;amandarg@umich.edu;yuekai@umich.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nYurochkin2020Training,\ntitle={Training individually fair ML models with sensitive subspace robustness},\nauthor={Mikhail Yurochkin and Amanda Bower and Yuekai Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gdkxHFDH}\n}", "github": "https://github.com/IBM/sensitive-subspace-robustness", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gdkxHFDH", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "699;186;211", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "524;497;238", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 365.3333333333333, 236.1586096014475 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 419.6666666666667, 128.92978278461842 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18102623998603329338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1gi-TVKwB", "title": "Learning an off-policy predictive state representation for deep reinforcement learning for vision-based steering in autonomous driving", "track": "main", "status": "Withdraw", "tldr": "An algorithm to learn a predictive state representation with general value functions and off-policy learning is applied to the problem of vision-based steering in autonomous driving.", "abstract": "An algorithm is introduced for learning a predictive state representation with off-policy temporal difference (TD) learning that is then used to learn to steer a vehicle with reinforcement learning. There are three components being learned simultaneously: (1) the off-policy predictions as a compact representation of state, (2) the behavior policy distribution for estimating the off-policy predictions, and (3) the deterministic policy gradient for learning to act. A behavior policy discriminator is learned and used for estimating the important sampling ratios needed to learn the predictive representation off-policy with general value functions (GVFs). A linear deterministic policy gradient method is used to train the agent with only the predictive representations while the predictions are being learned. All three components are combined, demonstrated and evaluated on the problem of steering the vehicle from images in the TORCS racing simulator environment.\nSteering from only images is a challenging problem where evaluation is completed on a held-out set of tracks that were never seen during training in order to measure the generalization of the predictions and controller. 
Experiments show the proposed method is able to steer smoothly and navigate many but not all of the tracks available in TORCS with performance that exceeds DDPG using only images as input and approaches the performance of an ideal non-vision based kinematics model.", "keywords": "Predictive representations;general value functions;reinforcement learning;off-policy learning;behavior estimation", "primary_area": "", "supplementary_material": "", "author": "Daniel Graves", "authorids": "dgraves@ualberta.ca", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=B1gi-TVKwB", "pdf_size": 0, "rating": "1;3;3", "confidence": "0;0;0", "wc_review": "525;420;719", "wc_reply_reviewers": "276;0;0", "wc_reply_authors": "758;563;569", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 554.6666666666666, 123.85565074804711 ], "wc_reply_reviewers_avg": [ 92.0, 130.10764773832474 ], "wc_reply_authors_avg": [ 630.0, 90.54280755532159 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qLTquNVdmpwJ:scholar.google.com/&scioq=Learning+an+off-policy+predictive+state+representation+for+deep+reinforcement+learning+for+vision-based+steering+in+autonomous+driving&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gi0TEFDB", "title": "Understanding Top-k Sparsification in Distributed Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Distributed stochastic gradient descent (SGD) algorithms are widely deployed in training large-scale deep learning models, while the communication overhead among workers becomes the new system bottleneck. Recently proposed gradient sparsification techniques, especially Top-$k$ sparsification with error compensation (TopK-SGD), can significantly reduce the communication traffic without obvious impact on the model accuracy. Some theoretical studies have been carried out to analyze the convergence property of TopK-SGD. However, existing studies do not dive into the details of Top-$k$ operator in gradient sparsification and use relaxed bounds (e.g., exact bound of Random-$k$) for analysis; hence the derived results cannot well describe the real convergence performance of TopK-SGD. To this end, we first study the gradient distributions of TopK-SGD during training process through extensive experiments. We then theoretically derive a tighter bound for the Top-$k$ operator. 
Finally, we exploit the property of gradient distribution to propose an approximate top-$k$ selection algorithm, which is computing-efficient for GPUs, to improve the scaling efficiency of TopK-SGD by significantly reducing the computing overhead.", "keywords": "Distributed Deep Learning;SGD;Gradient Sparsification;Communication-efficient SGD;Top-k", "primary_area": "", "supplementary_material": "", "author": "Shaohuai Shi;Xiaowen Chu;Ka Chun Cheung;Simon See", "authorids": "csshshi@comp.hkbu.edu.hk;chxw@comp.hkbu.edu.hk;chcheung@nvidia.com;ssee@nvidia.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshi2020understanding,\ntitle={Understanding Top-k Sparsification in Distributed Deep Learning},\nauthor={Shaohuai Shi and Xiaowen Chu and Ka Chun Cheung and Simon See},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gi0TEFDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gi0TEFDB", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "313;718;288", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "422;1076;211", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 439.6666666666667, 197.07584552371935 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 569.6666666666666, 368.2484064987775 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11182575544128886596&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1gikpEtwH", "title": "Anomaly Detection and Localization in Images using Guided Attention", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Anomaly detection and localization is a popular computer vision problem which involves detecting anomalous images and localizing anomalies within them. However, this task is challenging due to small sample size and pixel coverage of the anomaly in real-world scenarios. Previous works have a drawback of using anomalous images to compute a threshold during training to detect and localize anomalies. To tackle these issues, we propose AVAGA - the first end-to-end trainable convolutional adversarial variational autoencoder (CAVAE) framework using guided attention which localizes the anomaly with the help of attention maps. AVAGA detects an image as anomalous from the large pixel-wise difference between the input and reconstructed image. In an unsupervised setting, we propose a guided attention loss, where we encourage AVAGA to focus on all non-anomalous regions in the image without using any anomalous images during training. Furthermore, we also propose a selective gradient backpropagation technique for guided attention, which enhances the performance of anomaly localization while using only 2% anomalous images in a weakly supervised setting. 
AVAGA outperforms the state-of-the-art (SoTA) methods by 10% and 18% on localization and 8% and 15% on classification accuracy in unsupervised and weakly supervised settings respectively on Mvtec Anomaly Detection (MvAD) dataset and by 11% and 22% on localization and 10% and 19% on classification accuracy in unsupervised and weakly supervised settings respectively on the modified ShanghaiTech Campus (STC) dataset", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shashanka Venkataramanan;Rajat Vikram Singh;Kuan-Chuan Peng", "authorids": "shashankv@knights.ucf.edu;singh.rajat@siemens.com;kp388@cornell.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gikpEtwH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "234;134;498", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "449;442;697", "reply_reviewers": "0;0;0", "reply_authors": "1;1;3", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 288.6666666666667, 153.5476762731657 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 529.3333333333334, 118.59267356047853 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-iSuQ7wS1zYJ:scholar.google.com/&scioq=Anomaly+Detection+and+Localization+in+Images+using+Guided+Attention&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gjs6EtDr", "title": "Efficient Content-Based Sparse Attention with Routing Transformers", "track": "main", "status": "Reject", "tldr": "We propose a content-based sparse attention model and show improvements on language modeling and image generation.", "abstract": "Self-attention has recently been adopted for a wide range of sequence modeling\nproblems. Despite its effectiveness, self-attention suffers quadratic compute and\nmemory requirements with respect to sequence length. Successful approaches to\nreduce this complexity focused on attention to local sliding windows or a small\nset of locations independent of content. Our work proposes to learn dynamic\nsparse attention patterns that avoid allocating computation and memory to attend\nto content unrelated to the query of interest. This work builds upon two lines of\nresearch: it combines the modeling flexibility of prior work on content-based sparse\nattention with the efficiency gains from approaches based on local, temporal sparse\nattention. Our model, the Routing Transformer, endows self-attention with a sparse\nrouting module based on online k-means while reducing the overall complexity of\nattention to O(n^{1.5}d) from O(n^2d) for sequence length n and hidden dimension\nd. 
We show that our model outperforms comparable sparse attention models on\nlanguage modeling on Wikitext-103 (15.8 vs 18.3 perplexity) as well as on\nimage generation on ImageNet-64 (3.43 vs 3.44 bits/dim) while using fewer self-attention layers.\nCode will be open-sourced on acceptance.", "keywords": "Sparse attention;autoregressive;generative models", "primary_area": "", "supplementary_material": "", "author": "Aurko Roy*;Mohammad Taghi Saffar*;David Grangier;Ashish Vaswani", "authorids": "aurkor@google.com;msaffar@google.com;grangier@google.com;avaswani@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nroy*2020efficient,\ntitle={Efficient Content-Based Sparse Attention with Routing Transformers},\nauthor={Aurko Roy* and Mohammad Taghi Saffar* and David Grangier and Ashish Vaswani},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gjs6EtDr}\n}", "github": "http://open-sourced-on-acceptance.com", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gjs6EtDr", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "1268;342;126", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "887;0;0", "reply_reviewers": "0;0;0", "reply_authors": "2;0;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 578.6666666666666, 495.344548998192 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 295.6666666666667, 418.13580994164505 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.9428090415820634 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 671, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11653633172486276299&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "B1gkpR4FDB", "title": "Statistical Adaptive Stochastic Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We investigate statistical methods for automatically scheduling the learning rate (step size) in stochastic optimization. First, we consider a broad family of stochastic optimization methods with constant hyperparameters (including the learning rate and various forms of momentum) and derive a general necessary condition for the resulting dynamics to be stationary. Based on this condition, we develop a simple online statistical test to detect (non-)stationarity and use it to automatically drop the learning rate by a constant factor whenever stationarity is detected. Unlike in prior work, our stationarity condition and our statistical test applies to different algorithms without modification. Finally, we propose a smoothed stochastic line-search method that can be used to warm up the optimization process before the statistical test can be applied effectively. This removes the expensive trial and error for setting a good initial learning rate. 
The combined method is highly autonomous and it attains state-of-the-art training and testing performance in our experiments on several deep learning tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengchuan Zhang;Hunter Lang;Qiang Liu;Lin Xiao", "authorids": "penzhan@microsoft.com;hjl@mit.edu;lqiang@cs.utexas.edu;lin.xiao@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2020statistical,\ntitle={Statistical Adaptive Stochastic Optimization},\nauthor={Pengchuan Zhang and Hunter Lang and Qiang Liu and Lin Xiao},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gkpR4FDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gkpR4FDB", "pdf_size": 0, "rating": "3;6;8", "confidence": "0;0;0", "wc_review": "315;447;586", "wc_reply_reviewers": "447;144;0", "wc_reply_authors": "559;751;573", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 449.3333333333333, 110.64758871701132 ], "wc_reply_reviewers_avg": [ 197.0, 186.2954642496698 ], "wc_reply_authors_avg": [ 627.6666666666666, 87.39692341394074 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vVX_x07aUHIJ:scholar.google.com/&scioq=Statistical+Adaptive+Stochastic+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gm-a4tDH", "title": "Modeling treatment events in disease progression", "track": "main", "status": "Reject", "tldr": "A novel matrix completion based algorithm to model disease progression with events", "abstract": "Ability to quantify and predict progression of a disease is fundamental for selecting an appropriate treatment. Many clinical metrics cannot be acquired frequently either because of their cost (e.g. MRI, gait analysis) or because they are inconvenient or harmful to a patient (e.g. biopsy, x-ray). In such scenarios, in order to estimate individual trajectories of disease progression, it is advantageous to leverage similarities between patients, i.e. the covariance of trajectories, and find a latent representation of progression. Most existing methods for estimating trajectories do not account for events in-between observations, which dramatically decreases their adequacy for clinical practice. In this study, we develop a machine learning framework named Coordinatewise-Soft-Impute (CSI) for analyzing disease progression from sparse observations in the presence of confounding events. CSI is guaranteed to converge to the global minimum of the corresponding optimization problem. 
Experimental results also demonstrate the effectiveness of CSI using both simulated and real datasets.", "keywords": "disease progression;treatment events;matrix completion", "primary_area": "", "supplementary_material": "", "author": "Guanyang Wang;Yumeng Zhang;Yong Deng;Xuxin Huang;Lukasz Kidzinski", "authorids": "guanyang@stanford.edu;zym3008@gmail.com;yongdeng@stanford.edu;xxhuang@stanford.edu;lukasz.kidzinski@stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwang2020modeling,\ntitle={Modeling treatment events in disease progression},\nauthor={Guanyang Wang and Yumeng Zhang and Yong Deng and Xuxin Huang and Lukasz Kidzinski},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gm-a4tDH}\n}", "github": "https://www.dropbox.com/sh/y7h9utzsord2k79/AABpL0qWjOse-6dgj3-k0vina?dl=0", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gm-a4tDH", "pdf_size": 0, "rating": "1;1;1", "confidence": "0;0;0", "wc_review": "134;328;328", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 263.3333333333333, 91.45247703346016 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2N8smJhJtioJ:scholar.google.com/&scioq=Modeling+treatment+events+in+disease+progression&hl=en&as_sdt=0,33", "gs_version_total": 5 }, { "id": "B1gn-pEKwH", "title": "INFERENCE, PREDICTION, AND ENTROPY RATE OF CONTINUOUS-TIME, DISCRETE-EVENT PROCESSES", "track": "main", "status": "Reject", "tldr": "A new method for inferring a model of, estimating the entropy rate of, and predicting continuous-time, discrete-event processes.", "abstract": "The inference of models, prediction of future symbols, and entropy rate estimation of discrete-time, discrete-event processes is well-worn ground. However, many time series are better conceptualized as continuous-time, discrete-event processes. Here, we provide new methods for inferring models, predicting future symbols, and estimating the entropy rate of continuous-time, discrete-event processes. The methods rely on an extension of Bayesian structural inference that takes advantage of neural network\u2019s universal approximation power. Based on experiments with simple synthetic data, these new methods seem to be competitive with state-of-the-art methods for prediction and entropy rate estimation as long as the correct model is inferred.", "keywords": "continuous-time prediction", "primary_area": "", "supplementary_material": "", "author": "Sarah Marzen;James P. Crutchfield", "authorids": "smarzen@cmc.edu;chaos@cse.ucdavis.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmarzen2020inference,\ntitle={{\\{}INFERENCE{\\}}, {\\{}PREDICTION{\\}}, {\\{}AND{\\}} {\\{}ENTROPY{\\}} {\\{}RATE{\\}} {\\{}OF{\\}} {\\{}CONTINUOUS{\\}}-{\\{}TIME{\\}}, {\\{}DISCRETE{\\}}-{\\{}EVENT{\\}} {\\{}PROCESSES{\\}}},\nauthor={Sarah Marzen and James P. 
Crutchfield},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gn-pEKwH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gn-pEKwH", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "609;646;401", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 552.0, 107.8362956831635 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OhjoJ0L0qpsJ:scholar.google.com/&scioq=INFERENCE,+PREDICTION,+AND+ENTROPY+RATE+OF+CONTINUOUS-TIME,+DISCRETE-EVENT+PROCESSES&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gqipNYwH", "title": "Option Discovery using Deep Skill Chaining", "track": "main", "status": "Poster", "tldr": "We present a new hierarchical reinforcement learning algorithm which can solve high-dimensional goal-oriented tasks more reliably than non-hierarchical agents and other state-of-the-art skill discovery techniques.", "abstract": "Autonomously discovering temporally extended actions, or skills, is a longstanding goal of hierarchical reinforcement learning. We propose a new algorithm that combines skill chaining with deep neural networks to autonomously discover skills in high-dimensional, continuous domains. The resulting algorithm, deep skill chaining, constructs skills with the property that executing one enables the agent to execute another. 
We demonstrate that deep skill chaining significantly outperforms both non-hierarchical agents and other state-of-the-art skill discovery techniques in challenging continuous control tasks.", "keywords": "Hierarchical Reinforcement Learning;Reinforcement Learning;Skill Discovery;Deep Learning;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Akhil Bagaria;George Konidaris", "authorids": "akhil_bagaria@brown.edu;gdk@cs.brown.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nBagaria2020Option,\ntitle={Option Discovery using Deep Skill Chaining},\nauthor={Akhil Bagaria and George Konidaris},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gqipNYwH}\n}", "github": "https://github.com/deep-skill-chaining/deep-skill-chaining", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gqipNYwH", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "487;528;743", "wc_reply_reviewers": "103;0;148", "wc_reply_authors": "980;666;1465", "reply_reviewers": "1;0;1", "reply_authors": "2;1;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 586.0, 112.27050666433578 ], "wc_reply_reviewers_avg": [ 83.66666666666667, 61.94800687314771 ], "wc_reply_authors_avg": [ 1037.0, 328.6710614986763 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13264170387120464821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1grSREtDH", "title": "Bayesian Residual Policy Optimization: Scalable Bayesian Reinforcement Learning with Clairvoyant Experts", "track": "main", "status": "Reject", "tldr": "We propose a scalable Bayesian Reinforcement Learning algorithm that learns a Bayesian correction over an ensemble of clairvoyant experts to solve problems with complex latent rewards and dynamics.", "abstract": "Informed and robust decision making in the face of uncertainty is critical for robots that perform physical tasks alongside people. We formulate this as a Bayesian Reinforcement Learning problem over latent Markov Decision Processes (MDPs). While Bayes-optimality is theoretically the gold standard, existing algorithms do not scale well to continuous state and action spaces. We propose a scalable solution that builds on the following insight: in the absence of uncertainty, each latent MDP is easier to solve. We split the challenge into two simpler components. First, we obtain an ensemble of clairvoyant experts and fuse their advice to compute a baseline policy. Second, we train a Bayesian residual policy to improve upon the ensemble's recommendation and learn to reduce uncertainty. Our algorithm, Bayesian Residual Policy Optimization (BRPO), imports the scalability of policy gradient methods as well as the initialization from prior models. 
BRPO significantly improves the ensemble of experts and drastically outperforms existing adaptive RL methods.", "keywords": "Bayesian Residual Reinforcement Learning;Residual Reinforcement Learning;Bayes Policy Optimization", "primary_area": "", "supplementary_material": "", "author": "Gilwoo Lee;Brian Hou;Sanjiban Choudhury;Siddhartha S. Srinivasa", "authorids": "gilwoo@cs.uw.edu;bhou@cs.uw.edu;sanjibac@cs.uw.edu;siddh@cs.uw.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2020bayesian,\ntitle={Bayesian Residual Policy Optimization: Scalable Bayesian Reinforcement Learning with Clairvoyant Experts},\nauthor={Gilwoo Lee and Brian Hou and Sanjiban Choudhury and Siddhartha S. Srinivasa},\nyear={2020},\nurl={https://openreview.net/forum?id=B1grSREtDH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1grSREtDH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "479;102;260", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "710;128;408", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 280.3333333333333, 154.57971693883027 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 415.3333333333333, 237.65708255570436 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2389416385153745085&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1grayHYDH", "title": "Incorporating Perceptual Prior to Improve Model's Adversarial Robustness", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep Neural Networks trained using human-annotated data are able to achieve human-like accuracy on many computer vision tasks such as classification, object recognition and segmentation. However, they are still far from being as robust as the human visual system. In this paper, we demonstrate that even models that are trained to be robust to random perturbations do not necessarily learn robust representations. We propose to address this by imposing a perception based prior on the learned representations to ensure that perceptually similar images have similar representations. We demonstrate that, although this training method does not use adversarial samples during training, it significantly improves the network\u2019s robustness to single-step and multi-step adversarial attacks, thus validating our hypothesis that the network indeed learns more robust representations. Our proposed method provides a means of achieving adversarial robustness at no additional computational cost when compared to normal training. ", "keywords": "Representation learining;adversarial defense;robust neural networks", "primary_area": "", "supplementary_material": "", "author": "B.S. Vivek;Arya Baburaj;Ashutosh B Sathe;R. 
Venkatesh Babu", "authorids": "svivek@iisc.ac.in;aryababuraj@iisc.ac.in;satheab16.mech@coep.ac.in;venky@iisc.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1grayHYDH", "pdf_size": 0, "rating": "1;1;1", "confidence": "0;0;0", "wc_review": "546;190;331", "wc_reply_reviewers": "0;103;0", "wc_reply_authors": "221;179;118", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 1.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 355.6666666666667, 146.37926386236848 ], "wc_reply_reviewers_avg": [ 34.333333333333336, 48.554665641476255 ], "wc_reply_authors_avg": [ 172.66666666666666, 42.28737662970148 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oopbbnFszZUJ:scholar.google.com/&scioq=Incorporating+Perceptual+Prior+to+Improve+Model%27s+Adversarial+Robustness&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gskyStwr", "title": "Frequency-based Search-control in Dyna", "track": "main", "status": "Poster", "tldr": "Acquire states from high frequency region for search-control in Dyna.", "abstract": "Model-based reinforcement learning has been empirically demonstrated as a successful strategy to improve sample efficiency. In particular, Dyna is an elegant model-based architecture integrating learning and planning that provides huge flexibility of using a model. One of the most important components in Dyna is called search-control, which refers to the process of generating state or state-action pairs from which we query the model to acquire simulated experiences. Search-control is critical in improving learning efficiency. In this work, we propose a simple and novel search-control strategy by searching high frequency regions of the value function. Our main intuition is built on Shannon sampling theorem from signal processing, which indicates that a high frequency signal requires more samples to reconstruct. We empirically show that a high frequency function is more difficult to approximate. This suggests a search-control strategy: we should use states from high frequency regions of the value function to query the model to acquire more samples. We develop a simple strategy to locally measure the frequency of a function by gradient and hessian norms, and provide theoretical justification for this approach. 
We then apply our strategy to search-control in Dyna, and conduct experiments to show its property and effectiveness on benchmark domains.", "keywords": "Model-based reinforcement learning;search-control;Dyna;frequency of a signal", "primary_area": "", "supplementary_material": "", "author": "Yangchen Pan;Jincheng Mei;Amir-massoud Farahmand", "authorids": "pan6@ualberta.ca;jmei2@ualberta.ca;farahmand@vectorinstitute.ai", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nPan2020Frequency-based,\ntitle={Frequency-based Search-control in Dyna},\nauthor={Yangchen Pan and Jincheng Mei and Amir-massoud Farahmand},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gskyStwr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer5", "site": "https://openreview.net/forum?id=B1gskyStwr", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "285;426;510", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "516;421;816", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 407.0, 92.83318372220141 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 584.3333333333334, 168.34158395621947 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2849858529546206580&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "B1gtK0NKwr", "title": "Correctness Verification of Neural Network", "track": "main", "status": "Withdraw", "tldr": "We present the first verification that a neural network for perception tasks produces a correct output within a specified tolerance for every input of interest. ", "abstract": "We present the first verification that a neural network for perception tasks produces\na correct output within a specified tolerance for every input of interest. We define\ncorrectness relative to a specification which identifies 1) a state space consisting of\nall relevant states of the world and 2) an observation process that produces neural\nnetwork inputs from the states of the world. Tiling the state and input spaces with\na finite number of tiles, obtaining ground truth bounds from the state tiles and\nnetwork output bounds from the input tiles, then comparing the ground truth and\nnetwork output bounds delivers an upper bound on the network output error for\nany input of interest. 
Results from two case studies highlight the ability of our\ntechnique to deliver tight error bounds for all inputs of interest and show how the\nerror bounds vary over the state and input spaces.", "keywords": "Neural network verification;safety;reliability", "primary_area": "", "supplementary_material": "", "author": "Yichen Yang;Martin Rinard", "authorids": "yicheny@csail.mit.edu;rinard@csail.mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "https://anonymous.4open.science/r/5f526d25-cdbf-46db-b737-b235676481b7/", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gtK0NKwr", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "284;258;670", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "93;0;143", "reply_reviewers": "0;0;0", "reply_authors": "1;0;1", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 404.0, 188.3896670910235 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 78.66666666666667, 59.252754272598075 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15343493030980332407&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1guLAVFDB", "title": "Span Recovery for Deep Neural Networks with Applications to Input Obfuscation", "track": "main", "status": "Poster", "tldr": "We provably recover the span of a deep multi-layered neural network with latent structure and empirically apply efficient span recovery algorithms to attack networks by obfuscating inputs.", "abstract": "The tremendous success of deep neural networks has motivated the need to better understand the fundamental properties of these networks, but many of the theoretical results proposed have only been for shallow networks. In this paper, we study an important primitive for understanding the meaningful input space of a deep network: span recovery. For $k